#ifndef __CUDACC_RTC__
#include <type_traits>
#include <index_helper.cuh>
#include <type_traits>
#include <index_helper.cuh>
#include <trove_helper.cuh>
template <typename Float, typename T>
  __device__ __host__ inline void operator=(const M &a) {

template <typename T, int N>
  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);

template <typename T, int N>
  a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase);

template <typename Float, typename T>

template <typename T, int N>
template <typename S>
  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);

template <typename T, int N>
template <typename S>
  a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase);
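// The fragments above are the Matrix-assignment plumbing: assigning a
// gauge_wrapper (or gauge_ghost_wrapper) to a Matrix forwards to the
// accessor's load()/loadGhost(). A minimal standalone sketch of that wrapper
// pattern (illustration only -- AccessorSketch, Matrix3 and load() are
// hypothetical stand-ins, not the QUDA types):
#include <array>
struct Matrix3 { std::array<double, 18> data; };
struct AccessorSketch {
  void load(double out[18], int x_cb, int dim, int parity) const {
    for (int i = 0; i < 18; i++) out[i] = 0.0; // the real code fills this from field memory
  }
};
template <typename A> struct wrapper_sketch {
  const A &gauge; int dim, x_cb, parity;
  // conversion lets "Matrix3 m = acc(dim, x_cb, parity);" trigger a load
  operator Matrix3() const { Matrix3 m; gauge.load(m.data.data(), x_cb, dim, parity); return m; }
};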
template <typename ReduceType, typename Float> struct square_ {
  { return static_cast<ReduceType>(norm(x)); }

template <typename ReduceType> struct square_<ReduceType, int8_t> {
  square_(const ReduceType scale) : scale(scale) { }

template <typename ReduceType> struct square_<ReduceType, short> {
  square_(const ReduceType scale) : scale(scale) { }

template <typename ReduceType> struct square_<ReduceType, int> {
  square_(const ReduceType scale) : scale(scale) { }

template <typename Float, typename storeFloat> struct abs_ {
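// square_ maps an element to |z|^2 for a transform-reduce, and abs_ to |z|;
// the fixed-point specializations (int8_t, short, int) first rescale the
// integer-stored value back to its floating-point range. A self-contained
// sketch of that idea (plain std::complex; nothing here is the QUDA API):
#include <complex>
#include <cstdint>
template <typename ReduceType, typename StoreType> struct square_sketch {
  ReduceType scale; // the scale_inv that maps stored integers back to physical values
  square_sketch(ReduceType scale) : scale(scale) {}
  ReduceType operator()(const std::complex<StoreType> &x) const {
    std::complex<ReduceType> y(scale * x.real(), scale * x.imag());
    return std::norm(y); // |y|^2
  }
};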
template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool fixed_point() { return false; }

template <typename Float, typename storeFloat> __host__ __device__ inline constexpr bool match() { return false; }
template <> __host__ __device__ inline constexpr bool match<int, int>() { return true; }
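// fixed_point<Float,storeFloat>() reports whether the storage type is an
// integer (fixed-point) representation of Float, and match<a,b>() whether two
// types share the same representation so values can be copied without
// rescaling. A hedged sketch of the resulting round trip, using the same
// scale / scale_inv formulas the accessors below set up:
#include <limits>
inline short to_fixed_sketch(float x, float max) {
  float scale = static_cast<float>(std::numeric_limits<short>::max()) / max;
  return static_cast<short>(x * scale); // store: physical value -> integer
}
inline float from_fixed_sketch(short s, float max) {
  float scale_inv = max / static_cast<float>(std::numeric_limits<short>::max());
  return s * scale_inv;                 // load: integer -> physical value
}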
template <typename Float, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  __device__ __host__ inline auto data() { return &v[idx]; }
  __device__ __host__ inline const auto data() const { return &v[idx]; }
  if (match<storeFloat, theirFloat>()) {
  if (match<storeFloat, theirFloat>()) {
  if (match<storeFloat, theirFloat>()) {

template <typename Float, typename storeFloat>
template <typename Float, typename storeFloat>
template <typename Float, typename storeFloat>
template <typename Float, int nColor, QudaGaugeFieldOrder order, typename storeFloat> struct Accessor {
  errorQuda("Not implemented for order=%d", order);

template <typename Float, int nColor, QudaGaugeFieldOrder order, bool native_ghost, typename storeFloat>
  errorQuda("Not implemented for order=%d", order);
template <typename Float, int nColor, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : volumeCB(U.VolumeCB()), geometry(U.Geometry()), cb_offset((U.Bytes() >> 1) / (sizeof(complex<storeFloat>) * U.Geometry())),
    scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
    volumeCB(a.volumeCB),
    geometry(a.geometry),
    cb_offset(a.cb_offset),
    scale_inv(a.scale_inv)
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());

  template <typename theirFloat>
    if (fixed && !match<storeFloat, theirFloat>()) {
      atomicAdd(u2, (vec2 &)val_);
      atomicAdd(u2, (vec2 &)val);
    if (fixed && !match<storeFloat, theirFloat>()) {
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update
  template <typename helper, typename reducer>
    if (dim >= geometry)
      errorQuda("Requested dimension %d exceeds dimensionality of the field %d", dim, geometry);
    int lower = (dim == -1) ? 0 : dim;
    int ndim = (dim == -1 ? geometry : 1);
    std::vector<double> result(ndim);
    std::vector<complex<storeFloat> *> v(ndim);
    for (int d = 0; d < ndim; d++) v[d] = u[d + lower];
    for (auto &res : result) total = r(total, res);
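// The transform_reduce members above gather one pointer per requested
// dimension, apply a helper functor (such as square_ or abs_) to every
// element, and fold the per-dimension results with a reducer. A minimal
// host-only sketch of the same shape (std:: types only; the names here are
// illustrative, not the QUDA API):
#include <complex>
#include <functional>
#include <vector>
template <typename helper, typename reducer>
double transform_reduce_sketch(const std::vector<std::complex<float>> &field, helper h, double init, reducer r)
{
  double total = init;
  for (const auto &z : field) total = r(total, h(z));
  return total;
}
// e.g. squared L2 norm:
//   transform_reduce_sketch(field, [](auto z) { return std::norm(z); }, 0.0, std::plus<double>());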
template <typename Float, int nColor, bool native_ghost, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
    for (int d = 0; d < 4; d++) {
    scale_inv(a.scale_inv)
    for (int d = 0; d < 8; d++) {
      ghost[d] = a.ghost[d];
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
template <typename Float, int nColor, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
               static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
    volumeCB(U.VolumeCB()), geometry(U.Geometry()),
    scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
    volumeCB(a.volumeCB),
    geometry(a.geometry),
    scale_inv(a.scale_inv)
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());

  __device__ __host__ inline const auto wrap(int d, int parity, int x, int row, int col) const
    u, (((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col, scale, scale_inv);

  __device__ __host__ inline auto wrap(int d, int parity, int x, int row, int col)
    u, (((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col, scale, scale_inv);
    (u, (((parity * volumeCB + x) * geometry + d) * nColor + row) * nColor + col, scale, scale_inv); }

  template <typename theirFloat>
    vec2 *u2 = reinterpret_cast<vec2 *>(u + (((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col);
    if (fixed && !match<storeFloat, theirFloat>()) {
      atomicAdd(u2, (vec2 &)val_);
      atomicAdd(u2, (vec2 &)val);
    if (fixed && !match<storeFloat, theirFloat>()) {
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update
      u[(((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col].x += static_cast<storeFloat>(val.x);
#pragma omp atomic update
      u[(((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col].y += static_cast<storeFloat>(val.y);
  template <typename helper, typename reducer>
    if (dim >= geometry)
      errorQuda("Requested dimension %d exceeds dimensionality of the field %d", dim, geometry);
    std::vector<double> result = {init, init};
    std::vector<decltype(u)> v = {u + (0 * geometry + start) * volumeCB * nColor * nColor,
    return r(result[0], result[1]);
template <typename Float, int nColor, bool native_ghost, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0)) {
    for (int d = 0; d < 4; d++) {
    scale_inv(a.scale_inv)
    for (int d = 0; d < 8; d++) {
      ghost[d] = a.ghost[d];
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());

  __device__ __host__ inline const auto wrap(int d, int parity, int x, int row, int col) const
    ghost[d], parity * ghostOffset[d] + (x * nColor + row) * nColor + col, scale, scale_inv);

  __device__ __host__ inline auto wrap(int d, int parity, int x, int row, int col)
    ghost[d], parity * ghostOffset[d] + (x * nColor + row) * nColor + col, scale, scale_inv);
    (ghost[d], parity * ghostOffset[d] + (x * nColor + row) * nColor + col, scale, scale_inv); }
template <int nColor, int N>
__device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) {
  int j = ((row * nColor + col) * 2) / N;
  int i = ((row * nColor + col) * 2) % N;
  int index = ((x_cb + dim * stride * M + j * stride) * 2 + i) / 2;
  index += parity * offset_cb;
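// indexFloatN locates the complex (row, col) entry of a link within a FloatN
// layout, where the 2*Nc*Nc reals of a site are split into N-wide chunks that
// are strided across the volume. A standalone worked example of the same
// arithmetic for Nc = 3, N = 4; M is assumed here to be the number of chunks
// per link, since its definition is not shown in the fragment above:
inline int indexFloatN_example(int row, int col, int x_cb, int dim, int parity, int stride, int offset_cb)
{
  constexpr int nColor = 3, N = 4;
  constexpr int M = (2 * nColor * nColor + N - 1) / N; // assumed: N-wide chunks per link
  int j = ((row * nColor + col) * 2) / N;  // which chunk the real part falls in
  int i = ((row * nColor + col) * 2) % N;  // offset within that chunk
  int index = ((x_cb + dim * stride * M + j * stride) * 2 + i) / 2;
  return index + parity * offset_cb;
}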
template <typename Float, int nColor, typename storeFloat>
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
  : u(gauge_ ? static_cast<complex<storeFloat> *>(gauge_) :
               static_cast<complex<storeFloat> *>(const_cast<void *>(U.Gauge_p()))),
    offset_cb((U.Bytes() >> 1) / sizeof(complex<storeFloat>)),
    volumeCB(U.VolumeCB()), stride(U.Stride()), geometry(U.Geometry()),
    max(static_cast<Float>(1.0)), scale(static_cast<Float>(1.0)), scale_inv(static_cast<Float>(1.0))
    offset_cb(a.offset_cb),
    volumeCB(a.volumeCB),
    geometry(a.geometry),
    scale_inv(a.scale_inv)
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());

  template <typename theirFloat>
    if (fixed && !match<storeFloat, theirFloat>()) {
      atomicAdd(u2, (vec2 &)val_);
      atomicAdd(u2, (vec2 &)val);
    if (fixed && !match<storeFloat, theirFloat>()) {
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update
#pragma omp atomic update

  template <typename helper, typename reducer>
    if (dim >= geometry)
      errorQuda("Requested dimension %d exceeds dimensionality of the field %d", dim, geometry);
    std::vector<double> result = {init, init};
    std::vector<decltype(u)> v = {u + 0 * offset_cb + start * count, u + 1 * offset_cb + start * count};
    return r(result[0], result[1]);
template <typename Float, int nColor, bool native_ghost, typename storeFloat>
  int ghostVolumeCB[8];
  static constexpr bool fixed = fixed_point<Float, storeFloat>();
    volumeCB(U.VolumeCB()),
    scale(static_cast<Float>(1.0)),
    scale_inv(static_cast<Float>(1.0)),
    accessor(U, gauge_, ghost_)
    if (!native_ghost) assert(ghost_ != nullptr);
    for (int d = 0; d < 4; d++) {
    volumeCB(a.volumeCB),
    scale_inv(a.scale_inv),
    for (int d = 0; d < 8; d++) {
      ghost[d] = a.ghost[d];
    scale = static_cast<Float>(std::numeric_limits<storeFloat>::max()) / max;
    scale_inv = max / static_cast<Float>(std::numeric_limits<storeFloat>::max());
    return accessor(d % 4, parity, x_cb + (d / 4) * ghostVolumeCB[d] + volumeCB, row, col);
    return accessor(d % 4, parity, x_cb + (d / 4) * ghostVolumeCB[d] + volumeCB, row, col);
    (ghost[d], ((parity * nColor + row) * nColor + col) * ghostVolumeCB[d] + x_cb, scale, scale_inv);
  typename storeFloat_ = Float_>
    errorQuda("GaugeField ordering not supported with reconstruction");
  static constexpr bool fixedPoint() { return fixed_point<Float, storeFloat>(); }

  __device__ __host__ const auto wrap(int d, int parity, int x) const
    int c_row, int c_col) const
    int s_col, int c_row, int c_col)

  __device__ __host__ inline const auto wrap(int d, int parity, int x, int s_row, int s_col) const
  __device__ __host__ inline auto wrap(int d, int parity, int x, int s_row, int s_col)
    int s_col, int c_row, int c_col)
  __device__ __host__ inline const auto wrap_ghost(int d, int parity, int x, int s_row, int s_col) const

  template <typename theirFloat>
  __device__ __host__ inline int Ndim() const { return nDim; }
  __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; }
  __host__ double norm1(int dim = -1, bool global = true) const {
  __host__ double norm2(int dim = -1, bool global = true) const {
template <int N, typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase = QUDA_STAGGERED_PHASE_NO>
    for (int i = 0; i < N / 2; i++) {
      out[2 * i + 0] = scale_inv * in[i].real();
      out[2 * i + 1] = scale_inv * in[i].imag();
    for (int i = 0; i < N / 2; i++) {
      out[2 * i + 0] = in[i].real();
      out[2 * i + 1] = in[i].imag();

  template <typename I>
    const I *X, const int *R) const
    for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); }
    for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); }
template <QudaGhostExchange ghostExchange_, typename T, typename I>
  T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice,
    if (idx >= firstTimeSliceBound) {
      return isFirstTimeSlice ? tBoundary : scale;
    } else if (idx >= lastTimeSliceBound) {
      return isLastTimeSlice ? tBoundary : scale;
    if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) {
      return isFirstTimeSlice ? tBoundary : scale;
    } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) {
      return isLastTimeSlice ? tBoundary : scale;
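// timeBoundary picks between the temporal boundary factor (tBoundary, e.g. -1
// for antiperiodic fermions) and the neutral scale, depending on whether the
// checkerboard index sits on the first/last local time slice and whether this
// rank owns the global boundary. A hedged standalone sketch mirroring the
// branch structure shown above (illustration only):
template <typename T>
T time_boundary_sketch(int idx, T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound,
                       bool isFirstTimeSlice, bool isLastTimeSlice)
{
  if (idx >= firstTimeSliceBound) return isFirstTimeSlice ? tBoundary : scale;
  else if (idx >= lastTimeSliceBound) return isLastTimeSlice ? tBoundary : scale;
  return scale; // bulk links are unmodified
}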
template <typename Float, typename I>
    case 0: if (((x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
    case 1: if (((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
    case 2: if (((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
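// The cases above implement the MILC staggered-phase convention: the sign for
// direction mu is (-1) raised to a sum of the earlier coordinates, with the
// border offset R subtracted for extended fields. A compact standalone sketch
// of the same pattern (the dim == 3 case is not shown in the fragments above,
// so it is left out here too):
template <typename Float>
Float milc_staggered_phase_sketch(int dim, const int x[4], const int R[4])
{
  Float sign = static_cast<Float>(1.0);
  switch (dim) {
  case 0: if (((x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
  case 1: if (((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
  case 2: if (((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast<Float>(1.0); break;
  default: break;
  }
  return sign;
}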
template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<12, Float, ghostExchange_> {
    tBoundary(static_cast<real>(u.TBoundary())),
    firstTimeSliceBound(u.VolumeCB()),
    lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
    isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
    ghostExchange(u.GhostExchange())
    tBoundary(recon.tBoundary),
    firstTimeSliceBound(recon.firstTimeSliceBound),
    lastTimeSliceBound(recon.lastTimeSliceBound),
    isFirstTimeSlice(recon.isFirstTimeSlice),
    isLastTimeSlice(recon.isLastTimeSlice),
    ghostExchange(recon.ghostExchange)

    for (int i = 0; i < 6; i++) {
      out[2 * i + 0] = in[i].real();
      out[2 * i + 1] = in[i].imag();

  template <typename I>
    const I *X, const int *R) const
    for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);
    const real u0 = dir < 3 ?
      timeBoundary<ghostExchange_>(idx, X, R, tBoundary, static_cast<real>(1.0), firstTimeSliceBound,
                                   lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange);

    out[6] = cmul(out[2], out[4]);
    out[6] = cmac(out[1], out[5], -out[6]);
    out[6] = u0 * conj(out[6]);

    out[7] = cmul(out[0], out[5]);
    out[7] = cmac(out[2], out[3], -out[7]);
    out[7] = u0 * conj(out[7]);

    out[8] = cmul(out[1], out[3]);
    out[8] = cmac(out[0], out[4], -out[8]);
    out[8] = u0 * conj(out[8]);
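// Reconstruct<12> stores only the first two rows of the SU(3) link; the third
// row is rebuilt as u0 * conj(row0 x row1), the cross product of the first two
// rows, which is exactly what the cmul/cmac sequence above computes. A
// standalone sketch with std::complex (illustration; u0 carries the temporal
// boundary / anisotropy factor):
#include <complex>
using cplx12 = std::complex<double>;
inline void reconstruct_third_row_sketch(cplx12 out[9], double u0)
{
  out[6] = u0 * std::conj(out[1] * out[5] - out[2] * out[4]);
  out[7] = u0 * std::conj(out[2] * out[3] - out[0] * out[5]);
  out[8] = u0 * std::conj(out[0] * out[4] - out[1] * out[3]);
}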
template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<11, Float, ghostExchange_> {
    for (int i = 0; i < 2; i++) {
      out[2 * i + 0] = in[i + 1].real();
      out[2 * i + 1] = in[i + 1].imag();
    out[4] = in[5].real();
    out[5] = in[5].imag();
    out[6] = in[0].imag();
    out[7] = in[4].imag();
    out[8] = in[8].imag();

  template <typename I>
    const I *X, const int *R) const
    out[1] = complex(in[0], in[1]);
    out[2] = complex(in[2], in[3]);
    out[5] = complex(in[4], in[5]);
template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
    reconstruct_12(recon.reconstruct_12),
    reconstruct_12.Pack(out, in, idx);

  template <typename I>
    const I *X, const int *R) const
    for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]);

    out[6] = cmul(out[2], out[4]);
    out[6] = cmac(out[1], out[5], -out[6]);
    out[7] = cmul(out[0], out[5]);
    out[7] = cmac(out[2], out[3], -out[7]);
    out[8] = cmul(out[1], out[3]);
    out[8] = cmac(out[0], out[4], -out[8]);

    complex A(cos_sin[0], cos_sin[1]);
    out[6] = cmul(A, out[6]);
    out[7] = cmul(A, out[7]);
    out[8] = cmul(A, out[8]);

    complex expI3Phase = in[8] / denom;
    return arg(expI3Phase) / static_cast<real>(3.0);
    return expI3Phase.real() > 0 ? 1 : -1;
    for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];
    return phase = arg(det) / 3;
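// getPhase above recovers the U(1) phase that was folded into the stored link:
// with a static staggered phase it is just the sign of the last element, and
// otherwise it is arg(det U) / 3, since multiplying an SU(3) matrix by
// exp(i*phi) multiplies its determinant by exp(3*i*phi). A standalone sketch
// with the 3x3 determinant written out (illustration only):
#include <complex>
using cplx13 = std::complex<double>;
inline double get_phase_sketch(const cplx13 u[9])
{
  cplx13 det = u[0] * (u[4] * u[8] - u[5] * u[7])
             - u[1] * (u[3] * u[8] - u[5] * u[6])
             + u[2] * (u[3] * u[7] - u[4] * u[6]);
  return std::arg(det) / 3.0;
}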
template <typename Float, QudaGhostExchange ghostExchange_> struct Reconstruct<8, Float, ghostExchange_> {
    tBoundary(static_cast<real>(u.TBoundary()) * scale, 1.0 / (static_cast<real>(u.TBoundary()) * scale)),
    firstTimeSliceBound(u.VolumeCB()),
    lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2),
    isFirstTimeSlice(comm_coord(3) == 0 ? true : false),
    ghostExchange(u.GhostExchange())
    tBoundary(recon.tBoundary),
    firstTimeSliceBound(recon.firstTimeSliceBound),
    lastTimeSliceBound(recon.lastTimeSliceBound),
    isFirstTimeSlice(recon.isFirstTimeSlice),
    isLastTimeSlice(recon.isLastTimeSlice),
    ghostExchange(recon.ghostExchange)

    out[2] = in[4].real();
    out[3] = in[4].imag();
    out[4] = in[5].real();
    out[5] = in[5].imag();
    out[6] = in[0].real();
    out[7] = in[0].imag();

  template <typename I>
    real u0_inv = u.imag();
    for (int i = 1; i <= 3; i++)
      out[i] = complex(in[2 * i + 0], in[2 * i + 1]);

    real row_sum = out[1].real() * out[1].real();
    row_sum += out[1].imag() * out[1].imag();
    row_sum += out[2].real() * out[2].real();
    row_sum += out[2].imag() * out[2].imag();
    real row_sum_inv = static_cast<real>(1.0) / row_sum;

    real diff = u0_inv * u0_inv - row_sum;
    real U00_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);

    real column_sum = out[0].real() * out[0].real();
    column_sum += out[0].imag() * out[0].imag();
    column_sum += out[3].real() * out[3].real();
    column_sum += out[3].imag() * out[3].imag();
    diff = u0_inv * u0_inv - column_sum;
    real U20_mag = diff > 0.0 ? diff * rsqrt(diff) : static_cast<real>(0.0);

    real r_inv2 = u0_inv * row_sum_inv;

    out[4] = cmac(u0 * A, out[1], out[4]);
    out[4] = -r_inv2 * out[4];
    out[5] = cmac(-u0 * A, out[2], out[5]);
    out[5] = r_inv2 * out[5];
    out[7] = cmac(-u0 * A, out[1], out[7]);
    out[7] = r_inv2 * out[7];
    out[8] = cmac(u0 * A, out[2], out[8]);
    out[8] = -r_inv2 * out[8];

    for (int i = 0; i < 3; i++) {
      const auto tmp = out[i];
      out[i] = out[i + 3];
      out[i + 6] = -out[i + 6];

  template <typename I>
  __device__ __host__ inline void
    timeBoundary<ghostExchange_>(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound,
                                 isFirstTimeSlice, isLastTimeSlice, ghostExchange);
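// Reconstruct<8> keeps only 8 reals per link (maximal compression): two phases
// plus two full complex entries. On unpack, the missing magnitudes follow from
// unitarity of the (rescaled) rows and columns, which is what the row_sum /
// column_sum arithmetic above does. A standalone sketch of that single step,
// where u0_inv is the inverse of the row normalisation (illustration only):
#include <cmath>
inline double missing_magnitude_sketch(double u0_inv, double re1, double im1, double re2, double im2)
{
  double row_sum = re1 * re1 + im1 * im1 + re2 * re2 + im2 * im2; // |U01|^2 + |U02|^2
  double diff = u0_inv * u0_inv - row_sum;
  return diff > 0.0 ? std::sqrt(diff) : 0.0;                      // |U00|
}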
template <typename Float, QudaGhostExchange ghostExchange_, QudaStaggeredPhase stag_phase>
    reconstruct_8(recon.reconstruct_8),

    complex expI3Phase = in[8] / denom;
    return arg(expI3Phase) / static_cast<real>(3.0);
    return expI3Phase.real() > 0 ? 1 : -1;
    for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i];

    complex z(cos_sin[0], cos_sin[1]);
    for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]);
    for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; }
    reconstruct_8.Pack(out, su3, idx);

  template <typename I>
    const I *X, const int *R) const
    reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast<real>(1.0), static_cast<real>(1.0)),
    complex z(cos_sin[0], cos_sin[1]);
    for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]);
    for (int i = 0; i < 18; i++) { out[i] *= phase; }
__host__ __device__ constexpr int ct_sqrt(int n, int i = 1)
  return n == i ? n : (i * i < n ? ct_sqrt(n, i + 1) : i);

template <QudaStaggeredPhase phase> __host__ __device__ inline bool static_phase()
  default: return false;
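// ct_sqrt is a constexpr integer square root (by linear search), used so the
// number of colors can be recovered from a field length at compile time, e.g.
// Ncolor(length) = ct_sqrt(length / 2) for a field of 2 * Nc * Nc reals per
// link. A small usage check, assuming only the definition above:
static_assert(ct_sqrt(9) == 3, "ct_sqrt(9) should give Nc = 3");
static_assert(ct_sqrt(18 / 2) == 3, "an 18-real link corresponds to Nc = 3");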
template <typename Float, int length, int N, int reconLenParam,
  static constexpr int reconLen = (reconLenParam == 11) ? 10 : reconLenParam;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
      errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
    for (int i = 0; i < 4; i++) {
      ghost[i] = ghost_ ? ghost_[i] : 0;
    for (int i = 0; i < 4; i++) {

    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
    if (static_phase<stag_phase>() && (reconLen == 13 || use_inphase)) {
      phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);

    real phase = 1.0) const
    for (int i = 0; i < M; i++) {
      Vector vecTmp = vector_load<Vector>(
      for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
      phase *= static_cast<real>(2.0) * static_cast<real>(M_PI);
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
      static_cast<real>(phase / (2. * M_PI)));

    real phase = 1.0) const
    int dim, int g, int parity, const int R[]) const
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast<Float *>(&vecTmp)[j]);
    for (int i = 0; i < M; i++) {
      for (int j = 0; j < N; j++) copy(reinterpret_cast<Float *>(&vecTmp)[j], tmp[i * N + j]);
      static_cast<real>(phase / (2. * M_PI)));
template <typename real, int length> struct S {
  __host__ __device__ const real &operator[](int i) const { return v[i]; }
      errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone)");
    for (int i = 0; i < 4; i++) {
    for (int i = 0; i < 4; i++) {

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();

    real phase = 1.0) const
    int parity, const int R[]) const
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
    int g, int parity, const int R[])
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();

    for (int i = 0; i < 4; i++) gauge[i] = order.gauge[i];
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 1] = (Float)v[i].imag();

    for (int i = 0; i < 4; i++) gauge[i] = order.gauge[i];
    for (int i = 0; i < length / 2; i++) {
    for (int i = 0; i < length / 2; i++) {
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);
#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 0] = v[i].real();
      v_[2 * i + 1] = v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 0] = v[i].real();
      v_[2 * i + 1] = v[i].imag();
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
    if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); }

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    structure v_ = gauge_[dir];
    auto v_ = &gauge0[dir * length];
    for (int i = 0; i < length / 2; i++) v[i] = complex(v_[2 * i + 0], v_[2 * i + 1]);

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < length / 2; i++) {
      v_[2 * i + 0] = v[i].real();
      v_[2 * i + 1] = v[i].imag();
    for (int i = 0; i < length / 2; i++) {
      gauge0[dir * length + 2 * i + 0] = v[i].real();
      gauge0[dir * length + 2 * i + 1] = v[i].imag();
  static constexpr int Nc = 3;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++)
      for (int j = 0; j < Nc; j++) {
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
  static constexpr int Nc = 3;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
    for (int i = 1; i < 4; i++) exVolumeCB *= u.X()[i] + 2;

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) { v[i * Nc + j] = complex(v_[(j * Nc + i) * 2 + 0], v_[(j * Nc + i) * 2 + 1]); }

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++)
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 0] = v[i * Nc + j].real();
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag();
  static constexpr int Nc = 3;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++)
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
  static constexpr int Nc = 3;
    gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()),
    dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]},
    exDim {u.X()[0], u.X()[1], u.X()[2] + 4, u.X()[3]}
    return linkIndex(coord, exDim);

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {

#if defined(__CUDA_ARCH__) && !defined(DISABLE_TROVE)
    for (int i = 0; i < Nc; i++)
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
    for (int i = 0; i < Nc; i++) {
      for (int j = 0; j < Nc; j++) {
        v_[(j * Nc + i) * 2 + 1] = v[i * Nc + j].imag() * scale;
template <typename otherFloat, typename storeFloat>
template <typename otherFloat, typename storeFloat>
template <typename otherFloat, typename storeFloat>
template <typename otherFloat, typename storeFloat>
template <int N, QudaStaggeredPhase stag, bool huge_alloc, QudaGhostExchange ghostExchange, bool use_inphase>
  // (this template header repeats for each gauge_mapper / FloatNOrder specialization)