8 template <
typename Float>
10 a[0] += sign*(b[0]*c[0] - b[1]*c[1]);
11 a[1] += sign*(b[0]*c[1] + b[1]*c[0]);
15 template <
typename Float>
17 a[0] = b[0]*c[0] - b[1]*c[1];
18 a[1] = b[0]*c[1] + b[1]*c[0];
22 template <
typename Float>
24 a[0] = b[0]*c[0] + b[1]*c[1];
25 a[1] = b[0]*c[1] - b[1]*c[0];
30 template <
typename Float>
33 Float denom = c[0]*c[0] + c[1]*c[1];
39 template <
typename Float>
41 a[0] += sign * (b[0]*c[0] - b[1]*c[1]);
42 a[1] -= sign * (b[0]*c[1] + b[1]*c[0]);
46 template <
typename Float>
48 a[0] = b[0]*c[0] - b[1]*c[1];
49 a[1] = -b[0]*c[1] - b[1]*c[0];
53 template <
int N,
typename Float>
59 for (
int i=0; i<N; i++)
out[i] =
in[i];
62 for (
int i=0; i<N; i++)
out[i] =
in[i];
74 template <
typename Float>
81 for (
int i=0; i<18; i++) out[i] = in[i] / scale;
85 for (
int i=0; i<18; i++) out[i] = scale * in[i];
91 template <
typename Float>
93 bool isFirstTimeSlice,
bool isLastTimeSlice) {
106 template <
typename Float>
108 QudaTboundary tBoundary,
bool isFirstTimeSlice,
bool isLastTimeSlice,
111 if ( idx >=
X[3]*
X[2]*
X[1]*
X[0]/2 ) {
112 return isFirstTimeSlice ? tBoundary : 1.0;
113 }
else if ( idx >= (
X[3]-1)*
X[0]*
X[1]*
X[2]/2 ) {
114 return isLastTimeSlice ? tBoundary : 1.0;
119 if ( idx >= (R[3]-1)*
X[0]*
X[1]*
X[2]/2 && idx < R[3]*
X[0]*
X[1]*
X[2]/2 ) {
121 return isFirstTimeSlice ? tBoundary : 1.0;
122 }
else if ( idx >= (
X[3]-R[3]-1)*
X[0]*
X[1]*
X[2]/2 && idx < (
X[3]-R[3])*
X[0]*
X[1]*
X[2]/2 ) {
124 return isLastTimeSlice ? tBoundary : 1.0;
131 template <
typename Float>
143 isFirstTimeSlice(
comm_coord(3) == 0 ?true : false),
145 ghostExchange(u.GhostExchange()) {
153 for (
int i=0; i<12; i++) out[i] = in[i];
157 int idx,
int dir,
const RegType phase)
const {
158 for (
int i=0; i<12; i++) out[i] = in[i];
159 for (
int i=12; i<18; i++) out[i] = 0.0;
168 timeBoundary<RegType>(
idx,
X, R, tBoundary,isFirstTimeSlice, isLastTimeSlice, ghostExchange);
170 for (
int i=12; i<18; i++) out[i]*=u0;
179 template <
typename Float>
186 for (
int i=0; i<4; i++) out[i] = in[i+2];
196 int idx,
int dir,
const RegType phase)
const {
199 for (
int i=0; i<4; i++) out[i+2] = in[i];
215 { *phase=0;
return; }
219 template <
typename Float>
228 reconstruct_12.Pack(out, in, idx);
232 for(
int i=0; i<12; ++i) out[i] = in[i];
233 for(
int i=12; i<18; ++i) out[i] = 0.0;
248 complexProduct(tmp, cos_sin, &out[12]); out[12] = tmp[0]; out[13] = tmp[1];
249 complexProduct(tmp, cos_sin, &out[14]); out[14] = tmp[0]; out[15] = tmp[1];
250 complexProduct(tmp, cos_sin, &out[16]); out[16] = tmp[0]; out[17] = tmp[1];
260 denom[1] /= (-scale);
273 template <
typename Float>
285 isFirstTimeSlice(
comm_coord(3) == 0 ? true : false),
287 ghostExchange(u.GhostExchange()) {
297 for (
int i=2; i<8; i++) out[i] = in[i];
301 int idx,
int dir,
const RegType phase)
const {
304 for (
int i=2; i<6; i++) {
306 row_sum += in[i]*in[i];
310 timeBoundary<RegType>(
idx,
X, R, tBoundary,isFirstTimeSlice, isLastTimeSlice, ghostExchange);
312 RegType diff = 1.0/(u0*u0) - row_sum;
320 for (
int i=0; i<2; i++) column_sum += out[i]*out[i];
321 for (
int i=6; i<8; i++) {
323 column_sum += in[i]*in[i];
325 diff = 1.f/(u0*u0) - column_sum;
333 RegType r_inv2 = 1.0/(u0*row_sum);
367 template <
typename Float>
383 denom[1] /= (-scale);
398 sincos(-phase, &cos_sin[1], &cos_sin[0]);
404 for(
int i=0; i<4; ++i){
408 reconstruct_8.Pack(out, su3, idx);
412 reconstruct_8.Unpack(out, in, idx, dir, phase);
420 complexProduct(tmp, cos_sin, &out[0]); out[0] = tmp[0]; out[1] = tmp[1];
421 complexProduct(tmp, cos_sin, &out[2]); out[2] = tmp[0]; out[3] = tmp[1];
422 complexProduct(tmp, cos_sin, &out[4]); out[4] = tmp[0]; out[5] = tmp[1];
423 complexProduct(tmp, cos_sin, &out[6]); out[6] = tmp[0]; out[7] = tmp[1];
424 complexProduct(tmp, cos_sin, &out[8]); out[8] = tmp[0]; out[9] = tmp[1];
425 complexProduct(tmp, cos_sin, &out[10]); out[10] = tmp[0]; out[11] = tmp[1];
426 complexProduct(tmp, cos_sin, &out[12]); out[12] = tmp[0]; out[13] = tmp[1];
427 complexProduct(tmp, cos_sin, &out[14]); out[14] = tmp[0]; out[15] = tmp[1];
428 complexProduct(tmp, cos_sin, &out[16]); out[16] = tmp[0]; out[17] = tmp[1];
434 template <
typename Float,
int length,
int N,
int reconLen>
444 #if __COMPUTE_CAPABILITY__ >= 200
446 const size_t phaseOffset;
451 #
if __COMPUTE_CAPABILITY__ >= 200
453 phaseOffset(u.PhaseOffset())
459 for (
int i=0; i<4; i++) {
460 ghost[i] = ghost_ ? ghost_[i] : 0;
469 #
if __COMPUTE_CAPABILITY__ >= 200
470 , hasPhase(order.hasPhase), phaseOffset(order.phaseOffset)
473 gauge[0] = order.gauge[0];
474 gauge[1] = order.gauge[1];
475 for (
int i=0; i<4; i++) {
476 ghost[i] = order.ghost[i];
483 const int M = reconLen / N;
485 for (
int i=0; i<M; i++) {
486 for (
int j=0; j<N; j++) {
487 int intIdx = i*N + j;
488 int padIdx = intIdx / N;
493 #if __COMPUTE_CAPABILITY__ >= 200
501 const int M = reconLen / N;
504 for (
int i=0; i<M; i++) {
505 for (
int j=0; j<N; j++) {
506 int intIdx = i*N + j;
507 int padIdx = intIdx / N;
511 #if __COMPUTE_CAPABILITY__ >= 200
515 copy(
gauge[parity][phaseOffset/
sizeof(
Float) + dir*
stride + x], static_cast<RegType>(phase/(2.*M_PI)));
525 const int M = reconLen / N;
527 for (
int i=0; i<M; i++) {
528 for (
int j=0; j<N; j++) {
529 int intIdx = i*N + j;
530 int padIdx = intIdx / N;
531 #if __COMPUTE_CAPABILITY__ < 200
532 const int hasPhase = 0;
538 #if __COMPUTE_CAPABILITY__ >= 200
549 const int M = reconLen / N;
552 for (
int i=0; i<M; i++) {
553 for (
int j=0; j<N; j++) {
554 int intIdx = i*N + j;
555 int padIdx = intIdx / N;
556 #if __COMPUTE_CAPABILITY__ < 200
557 const int hasPhase = 0;
563 #if __COMPUTE_CAPABILITY__ >= 200
574 int dim,
int g,
int parity,
const int R[])
const {
575 #if __COMPUTE_CAPABILITY__ < 200
576 const int hasPhase = 0;
578 const int M = reconLen / N;
580 for (
int i=0; i<M; i++) {
581 for (
int j=0; j<N; j++) {
582 int intIdx = i*N + j;
583 int padIdx = intIdx / N;
585 + (padIdx*R[dim]*
faceVolumeCB[dim]+buff_idx)*N + intIdx%N]);
597 int dir,
int dim,
int g,
int parity,
const int R[]) {
598 #if __COMPUTE_CAPABILITY__ < 200
599 const int hasPhase = 0;
601 const int M = reconLen / N;
605 for (
int i=0; i<M; i++) {
606 for (
int j=0; j<N; j++) {
607 int intIdx = i*N + j;
608 int padIdx = intIdx / N;
610 + (padIdx*R[dim]*
faceVolumeCB[dim]+buff_idx)*N + intIdx%N], tmp[i*N+j]);
617 static_cast<RegType>(phase/(2.*M_PI)));
628 template <
typename Float,
int length>
640 for (
int i=0; i<4; i++) {
648 for (
int i=0; i<4; i++) {
665 int dim,
int g,
int parity,
const int R[])
const {
666 for (
int i=0; i<
length; i++) {
673 int dir,
int dim,
int g,
int parity,
const int R[]) {
674 for (
int i=0; i<
length; i++) {
686 template <
typename Float,
int length>
struct QDPOrder :
public LegacyOrder<Float,length> {
694 for(
int i=0; i<4; i++)
gauge[i] = order.gauge[i];
699 for (
int i=0; i<
length; i++) {
705 for (
int i=0; i<
length; i++) {
717 template <
typename Float,
int length>
struct QDPJITOrder :
public LegacyOrder<Float,length> {
725 for(
int i=0; i<4; i++)
gauge[i] = order.gauge[i];
730 for (
int i=0; i<
length; i++) {
738 for (
int i=0; i<
length; i++) {
766 for (
int i=0; i<
length; i++) {
772 for (
int i=0; i<
length; i++) {
803 for (
int i=0; i<
Nc; i++) {
804 for (
int j=0; j<
Nc; j++) {
805 for (
int z=0; z<2; z++) {
814 for (
int i=0; i<
Nc; i++) {
815 for (
int j=0; j<
Nc; j++) {
816 for (
int z=0; z<2; z++) {
831 template <
typename Float,
int length>
struct BQCDOrder : LegacyOrder<Float,length> {
842 for (
int i=1; i<4; i++)
exVolumeCB *= u.
X()[i] + 2;
853 for (
int i=0; i<
Nc; i++) {
854 for (
int j=0; j<
Nc; j++) {
855 for (
int z=0; z<2; z++) {
863 for (
int i=0; i<
Nc; i++) {
864 for (
int j=0; j<
Nc; j++) {
865 for (
int z=0; z<2; z++) {
899 for (
int i=0; i<
Nc; i++) {
900 for (
int j=0; j<
Nc; j++) {
901 for (
int z=0; z<2; z++) {
909 for (
int i=0; i<
Nc; i++) {
910 for (
int j=0; j<
Nc; j++) {
911 for (
int z=0; z<2; z++) {
QDPOrder(const QDPOrder &order)
const QudaTboundary tBoundary
__device__ __host__ void Unpack(RegType out[18], const RegType in[12], int idx, int dir, const RegType phase) const
mapper< Float >::type RegType
const QudaTboundary tBoundary
const void ** Ghost() const
Float * ghost[QUDA_MAX_DIM]
__device__ __host__ void loadGhostEx(RegType v[length], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[]) const
__device__ __host__ void save(const RegType v[18], int x, int dir, int parity)
Float * gauge[QUDA_MAX_DIM]
__device__ __host__ void loadGhost(RegType v[length], int x, int dir, int parity) const
Reconstruct(const GaugeField &u)
Float * gauge[QUDA_MAX_DIM]
__device__ __host__ void accumulateConjugateProduct(Float *a, const Float *b, const Float *c, int sign)
__device__ __host__ void save(const RegType v[length], int x, int parity)
__device__ __host__ void complexProduct(Float *a, const Float *b, const Float *c)
mapper< Float >::type RegType
Reconstruct(const GaugeField &u)
__device__ __host__ void load(RegType v[length], int x, int dir, int parity) const
__device__ __host__ void complexQuotient(Float *a, const Float *b, const Float *c)
__device__ __host__ void save(const RegType v[length], int x, int dir, int parity)
__host__ __device__ ValueType sqrt(ValueType x)
__device__ __host__ void getPhase(RegType *phase, const RegType in[18])
__device__ __host__ void getPhase(RegType *phase, const RegType in[18])
__device__ __host__ void getPhase(RegType *phase, const RegType in[18]) const
__device__ __host__ void Unpack(RegType out[18], const RegType in[12], int idx, int dir, const RegType phase) const
__device__ __host__ void Unpack(RegType out[18], const RegType in[8], int idx, int dir, const RegType phase) const
__host__ __device__ void copy(T1 &a, const T2 &b)
__device__ __host__ void saveGhostEx(const RegType v[length], int buff_idx, int extended_idx, int dir, int dim, int g, int parity, const int R[])
const Reconstruct< 8, Float > reconstruct_8
__device__ __host__ void save(const RegType v[length], int x, int dir, int parity)
__device__ __host__ void Pack(RegType out[10], const RegType in[18], int idx) const
QudaGhostExchange ghostExchange
BQCDOrder(const BQCDOrder &order)
__device__ __host__ void Pack(RegType out[12], const RegType in[18], int idx) const
enum QudaTboundary_s QudaTboundary
__device__ __host__ void Unpack(RegType out[N], const RegType in[N], int idx, int dir, const RegType phase) const
const int * SurfaceCB() const
QDPJITOrder(const QDPJITOrder &order)
const Reconstruct< 12, Float > reconstruct_12
CPSOrder(const CPSOrder &order)
__device__ __host__ void load(RegType v[length], int x, int dir, int parity) const
__device__ __host__ void Pack(RegType out[N], const RegType in[N], int idx) const
__device__ __host__ Float timeBoundary(int idx, const int X[QUDA_MAX_DIM], QudaTboundary tBoundary, bool isFirstTimeSlice, bool isLastTimeSlice)
__device__ __host__ void Pack(RegType out[8], const RegType in[18], int idx) const
__device__ __host__ void Pack(RegType out[8], const RegType in[18], int idx) const
FloatNOrder(const FloatNOrder &order)
__device__ __host__ void getPhase(RegType *phase, const RegType in[18]) const
mapper< Float >::type RegType
__device__ __host__ void load(RegType v[18], int x, int dir, int parity) const
Reconstruct(const GaugeField &u)
cudaColorSpinorField * tmp
__device__ __host__ void loadGhost(RegType v[length], int x, int dir, int parity) const
TIFROrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void load(RegType v[18], int x, int dir, int parity) const
__device__ __host__ void saveGhost(const RegType v[length], int x, int dir, int parity)
int faceVolumeCB[QUDA_MAX_DIM]
FloatingPoint< float > Float
__device__ __host__ void accumulateComplexProduct(Float *a, const Float *b, const Float *c, Float sign)
TIFROrder(const TIFROrder &order)
__device__ __host__ void save(const RegType v[18], int x, int dir, int parity)
__device__ __host__ void load(RegType v[length], int x, int dir, int parity) const
enum QudaGhostExchange_s QudaGhostExchange
mapper< Float >::type RegType
mapper< Float >::type RegType
MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__constant__ double anisotropy
__constant__ double coeff
mapper< Float >::type RegType
mapper< Float >::type RegType
BQCDOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void getPhase(RegType *phase, const RegType in[18])
__device__ __host__ void Unpack(RegType out[18], const RegType in[8], int idx, int dir, const RegType phase) const
__device__ __host__ void complexDotProduct(Float *a, const Float *b, const Float *c)
__device__ __host__ void load(RegType v[18], int x, int dir, int parity) const
CPSOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
FloatNOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
__device__ __host__ void saveGhost(const RegType v[length], int x, int dir, int parity)
mapper< Float >::type RegType
__device__ __host__ void complexConjugateProduct(Float *a, const Float *b, const Float *c)
LegacyOrder(const GaugeField &u, Float **ghost_)
MILCOrder(const MILCOrder &order)
cpuColorSpinorField * out
QudaGhostExchange ghostExchange
__device__ __host__ void Pack(RegType out[12], const RegType in[18], int idx) const
Reconstruct< reconLen, Float > reconstruct
__device__ __host__ void load(RegType v[length], int x, int parity) const
__device__ __host__ void Pack(RegType out[18], const RegType in[18], int idx) const
__device__ __host__ void save(const RegType v[length], int x, int dir, int parity)
__device__ __host__ void loadGhostEx(RegType v[length], int x, int dummy, int dir, int dim, int g, int parity, const int R[]) const
Reconstruct(const GaugeField &u)
__device__ __host__ void Unpack(RegType out[18], const RegType in[10], int idx, int dir, const RegType phase) const
QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
LegacyOrder(const LegacyOrder &order)
mapper< Float >::type RegType
mapper< Float >::type RegType
__device__ __host__ void load(RegType v[length], int x, int dir, int parity) const
__device__ __host__ void getPhase(RegType *phase, const RegType in[18]) const
#define QUDA_MAX_DIM
Maximum number of dimensions supported by QUDA. In practice, no routines make use of more than 5...
mapper< Float >::type RegType
__device__ __host__ void saveGhostEx(const RegType v[length], int x, int dummy, int dir, int dim, int g, int parity, const int R[])
__device__ __host__ void save(const RegType v[length], int x, int dir, int parity)
__device__ __host__ void Unpack(RegType out[18], const RegType in[18], int idx, int dir, const RegType phase) const
mapper< Float >::type RegType
mapper< Float >::type RegType
QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0)
mapper< Float >::type RegType
mapper< Float >::type RegType
__device__ __host__ void getPhase(RegType *phase, const RegType in[18]) const
Reconstruct(const GaugeField &u)
__device__ __host__ void save(const RegType v[18], int x, int dir, int parity)
Reconstruct(const GaugeField &u)
Reconstruct(const GaugeField &u)