23 #define GAUGEFIXING_SITE_MATRIX_LOAD_TEX 26 #define GAUGEFIXING_DONT_USE_GX 34 #ifdef GAUGEFIXING_DONT_USE_GX 35 #warning Not using precalculated g(x) 37 #warning Using precalculated g(x) 42 #ifndef FL_UNITARIZE_PI 43 #define FL_UNITARIZE_PI 3.14159265358979323846 47 texture<float2, 1, cudaReadModeElementType> GXTexSingle;
48 texture<int4, 1, cudaReadModeElementType> GXTexDouble;
52 texture<float2, 1, cudaReadModeElementType> DELTATexSingle;
53 texture<int4, 1, cudaReadModeElementType> DELTATexDouble;
57 inline __device__ T TEXTURE_GX(
int id){
62 return tex1Dfetch(GXTexSingle,
id);
66 int4 u = tex1Dfetch(GXTexDouble,
id);
67 return complex<double>(__hiloint2double(u.y, u.x), __hiloint2double(u.w, u.z));
70 inline __device__ T TEXTURE_DELTA(
int id){
75 return tex1Dfetch(DELTATexSingle,
id);
79 int4 u = tex1Dfetch(DELTATexDouble,
id);
80 return complex<double>(__hiloint2double(u.y, u.x), __hiloint2double(u.w, u.z));
84 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX 85 #ifndef GAUGEFIXING_DONT_USE_GX 86 cudaBindTexture(0, GXTexSingle, gx,
bytes);
88 cudaBindTexture(0, DELTATexSingle,
delta,
bytes);
93 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX 94 #ifndef GAUGEFIXING_DONT_USE_GX 95 cudaBindTexture(0, GXTexDouble, gx,
bytes);
97 cudaBindTexture(0, DELTATexDouble,
delta,
bytes);
102 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX 103 #ifndef GAUGEFIXING_DONT_USE_GX 104 cudaUnbindTexture(GXTexSingle);
106 cudaUnbindTexture(DELTATexSingle);
111 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX 112 #ifndef GAUGEFIXING_DONT_USE_GX 113 cudaUnbindTexture(GXTexDouble);
115 cudaUnbindTexture(DELTATexDouble);
120 template <
typename Float>
121 struct GaugeFixFFTRotateArg {
124 complex<Float> *
tmp0;
125 complex<Float> *
tmp1;
127 for (
int dir = 0; dir < 4; ++dir )
X[dir] = data.
X()[dir];
128 threads =
X[0] *
X[1] *
X[2] *
X[3];
136 template <
int direction,
typename Float>
137 __global__
void fft_rotate_kernel_2D2D(GaugeFixFFTRotateArg<Float>
arg){
138 int id = blockIdx.x *
blockDim.x + threadIdx.x;
139 if (
id >=
arg.threads )
return;
140 if ( direction == 0 ) {
141 int x3 =
id / (
arg.X[0] *
arg.X[1] *
arg.X[2]);
142 int x2 = (
id / (
arg.X[0] *
arg.X[1])) %
arg.X[2];
143 int x1 = (
id /
arg.X[0]) %
arg.X[1];
144 int x0 =
id %
arg.X[0];
146 int id = x0 + (x1 + (x2 + x3 *
arg.X[2]) *
arg.X[1]) *
arg.X[0];
147 int id_out = x2 + (x3 + (x0 + x1 *
arg.X[0]) *
arg.X[3]) *
arg.X[2];
148 arg.tmp1[id_out] =
arg.tmp0[
id];
151 if ( direction == 1 ) {
153 int x1 =
id / (
arg.X[2] *
arg.X[3] *
arg.X[0]);
154 int x0 = (
id / (
arg.X[2] *
arg.X[3])) %
arg.X[0];
155 int x3 = (
id /
arg.X[2]) %
arg.X[3];
156 int x2 =
id %
arg.X[2];
158 int id = x2 + (x3 + (x0 + x1 *
arg.X[0]) *
arg.X[3]) *
arg.X[2];
159 int id_out = x0 + (x1 + (x2 + x3 *
arg.X[2]) *
arg.X[1]) *
arg.X[0];
160 arg.tmp1[id_out] =
arg.tmp0[
id];
170 template<
typename Float>
171 class GaugeFixFFTRotate :
Tunable {
172 GaugeFixFFTRotateArg<Float>
arg;
174 mutable char aux_string[128];
176 unsigned int sharedBytesPerThread()
const {
183 bool tuneGridDim()
const {
186 unsigned int minThreads()
const {
191 GaugeFixFFTRotate(GaugeFixFFTRotateArg<Float> &
arg) :
arg(
arg) {
194 ~GaugeFixFFTRotate () {
196 void setDirection(
int dir, complex<Float> *data_in, complex<Float> *data_out){
202 void apply(
const cudaStream_t &
stream){
204 if ( direction == 0 )
206 else if ( direction == 1 )
209 errorQuda(
"Error in GaugeFixFFTRotate option.\n");
213 std::stringstream vol;
214 vol <<
arg.X[0] <<
"x";
215 vol <<
arg.X[1] <<
"x";
216 vol <<
arg.X[2] <<
"x";
218 sprintf(aux_string,
"threads=%d,prec=%lu",
arg.threads,
sizeof(Float));
219 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
223 long long flops()
const {
226 long long bytes()
const {
227 return 4LL *
sizeof(Float) *
arg.threads;
233 template <
typename Float,
typename Gauge>
234 struct GaugeFixQualityArg :
public ReduceArg<double2> {
238 complex<Float> *
delta;
240 GaugeFixQualityArg(
const Gauge &dataOr,
const cudaGaugeField &data, complex<Float> *
delta)
242 for (
int dir = 0; dir < 4; ++dir )
X[dir] = data.
X()[dir];
245 double getAction(){
return result_h[0].x; }
246 double getTheta(){
return result_h[0].y; }
251 template<
int blockSize,
int Elems,
typename Float,
typename Gauge,
int gauge_dir>
252 __global__
void computeFix_quality(GaugeFixQualityArg<Float, Gauge> argQ){
256 double2 data = make_double2(0.0,0.0);
257 if (
idx < argQ.threads ) {
258 typedef complex<Float> Cmplx;
265 for (
int mu = 0;
mu < gauge_dir;
mu++ ) {
273 for (
int mu = 0;
mu < gauge_dir;
mu++ ) {
286 argQ.delta[
idx + 2 * argQ.threads] =
delta(0,1);
287 argQ.delta[
idx + 4 * argQ.threads] =
delta(0,2);
288 argQ.delta[
idx + 6 * argQ.threads] =
delta(1,1);
289 argQ.delta[
idx + 8 * argQ.threads] =
delta(1,2);
290 argQ.delta[
idx + 10 * argQ.threads] =
delta(2,2);
297 reduce2d<blockSize,2>(argQ, data);
302 template<
int Elems,
typename Float,
typename Gauge,
int gauge_dir>
304 GaugeFixQualityArg<Float, Gauge> argQ;
305 mutable char aux_string[128];
308 unsigned int minThreads()
const {
return argQ.threads; }
311 GaugeFixQuality(GaugeFixQualityArg<Float, Gauge> &argQ)
314 ~GaugeFixQuality () { }
316 void apply(
const cudaStream_t &
stream){
318 argQ.result_h[0] = make_double2(0.0,0.0);
321 argQ.result_h[0].x /= (
double)(3 * gauge_dir * 2 * argQ.threads);
322 argQ.result_h[0].y /= (
double)(3 * 2 * argQ.threads);
326 std::stringstream vol;
327 vol << argQ.X[0] <<
"x" << argQ.X[1] <<
"x" << argQ.X[2] <<
"x" << argQ.X[3];
328 sprintf(aux_string,
"threads=%d,prec=%lu,gaugedir=%d", argQ.threads,
sizeof(Float), gauge_dir);
329 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
332 long long flops()
const {
333 return (36LL * gauge_dir + 65LL) * 2 * argQ.threads;
335 long long bytes()
const {
336 return (2LL * gauge_dir + 2LL) * Elems * 2 * argQ.threads *
sizeof(Float);
343 template <
typename Float>
349 complex<Float> *
delta;
352 GaugeFixArg(
cudaGaugeField & data,
const int Elems) : data(data){
353 for (
int dir = 0; dir < 4; ++dir )
X[dir] = data.
X()[dir];
354 threads =
X[0] *
X[1] *
X[2] *
X[3];
357 #ifdef GAUGEFIXING_DONT_USE_GX 358 gx = (complex<Float>*)
device_malloc(
sizeof(complex<Float>) * threads);
360 gx = (complex<Float>*)
device_malloc(
sizeof(complex<Float>) * threads * Elems);
362 BindTex(
delta, gx,
sizeof(complex<Float>) * threads * Elems);
365 UnBindTex(
delta, gx);
375 template <
typename Float>
376 __global__
void kernel_gauge_set_invpsq(GaugeFixArg<Float>
arg){
377 int id = blockIdx.x *
blockDim.x + threadIdx.x;
378 if (
id >=
arg.threads )
return;
379 int x1 =
id / (
arg.X[2] *
arg.X[3] *
arg.X[0]);
380 int x0 = (
id / (
arg.X[2] *
arg.X[3])) %
arg.X[0];
381 int x3 = (
id /
arg.X[2]) %
arg.X[3];
382 int x2 =
id %
arg.X[2];
384 Float sx =
sin( (Float)x0 * FL_UNITARIZE_PI / (Float)
arg.X[0]);
385 Float sy =
sin( (Float)x1 * FL_UNITARIZE_PI / (Float)
arg.X[1]);
386 Float sz =
sin( (Float)x2 * FL_UNITARIZE_PI / (Float)
arg.X[2]);
387 Float st =
sin( (Float)x3 * FL_UNITARIZE_PI / (Float)
arg.X[3]);
388 Float sinsq = sx * sx + sy * sy + sz * sz + st * st;
391 if ( sinsq > 0.00001 ) prcfact = 4.0 / (sinsq * (Float)
arg.threads);
392 arg.invpsq[
id] = prcfact;
396 template<
typename Float>
397 class GaugeFixSETINVPSP :
Tunable {
398 GaugeFixArg<Float>
arg;
399 mutable char aux_string[128];
401 unsigned int sharedBytesPerThread()
const {
407 bool tuneSharedBytes()
const {
410 bool tuneGridDim()
const {
413 unsigned int minThreads()
const {
418 GaugeFixSETINVPSP(GaugeFixArg<Float> &
arg) :
arg(
arg) { }
419 ~GaugeFixSETINVPSP () { }
421 void apply(
const cudaStream_t &
stream){
427 std::stringstream vol;
428 vol <<
arg.X[0] <<
"x";
429 vol <<
arg.X[1] <<
"x";
430 vol <<
arg.X[2] <<
"x";
432 sprintf(aux_string,
"threads=%d,prec=%lu",
arg.threads,
sizeof(Float));
433 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
437 long long flops()
const {
438 return 21 *
arg.threads;
440 long long bytes()
const {
441 return sizeof(Float) *
arg.threads;
446 template<
typename Float>
447 __global__
void kernel_gauge_mult_norm_2D(GaugeFixArg<Float>
arg){
448 int id = blockIdx.x *
blockDim.x + threadIdx.x;
453 template<
typename Float>
454 class GaugeFixINVPSP :
Tunable {
455 GaugeFixArg<Float>
arg;
456 mutable char aux_string[128];
458 unsigned int sharedBytesPerThread()
const {
465 bool tuneGridDim()
const {
468 unsigned int minThreads()
const {
473 GaugeFixINVPSP(GaugeFixArg<Float> &
arg)
475 cudaFuncSetCacheConfig( kernel_gauge_mult_norm_2D<Float>, cudaFuncCachePreferL1);
480 void apply(
const cudaStream_t &
stream){
486 std::stringstream vol;
487 vol <<
arg.X[0] <<
"x";
488 vol <<
arg.X[1] <<
"x";
489 vol <<
arg.X[2] <<
"x";
491 sprintf(aux_string,
"threads=%d,prec=%lu",
arg.threads,
sizeof(Float));
492 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
498 complex<Float> *
tmp =
arg.gx;
505 long long flops()
const {
506 return 2LL *
arg.threads;
508 long long bytes()
const {
509 return 5LL *
sizeof(Float) *
arg.threads;
516 template <
typename Float>
517 __host__ __device__
inline void reunit_link(
Matrix<complex<Float>,3> &U ){
519 complex<Float> t2((Float)0.0, (Float)0.0);
524 for (
int c = 0;
c < 3;
c++ ) t1 +=
norm(U(0,
c));
525 t1 = (Float)1.0 /
sqrt(t1);
529 for (
int c = 0;
c < 3;
c++ ) U(0,
c) *= t1;
532 for (
int c = 0;
c < 3;
c++ ) t2 +=
conj(U(0,
c)) * U(1,
c);
535 for (
int c = 0;
c < 3;
c++ ) U(1,
c) -= t2 * U(0,
c);
541 for (
int c = 0;
c < 3;
c++ ) t1 +=
norm(U(1,
c));
542 t1 = (Float)1.0 /
sqrt(t1);
546 for (
int c = 0;
c < 3;
c++ ) U(1,
c) *= t1;
549 U(2,0) =
conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
550 U(2,1) =
conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
551 U(2,2) =
conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
556 #ifdef GAUGEFIXING_DONT_USE_GX 558 template <
typename Float,
typename Gauge>
559 __global__
void kernel_gauge_fix_U_EO_NEW( GaugeFixArg<Float>
arg, Gauge dataOr, Float half_alpha){
560 int id = threadIdx.x + blockIdx.x *
blockDim.x;
563 if (
id >=
arg.threads/2 )
return;
565 typedef complex<Float> Cmplx;
572 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX 573 de(0,0) = TEXTURE_DELTA<Cmplx>(
idx + 0 *
arg.threads);
574 de(0,1) = TEXTURE_DELTA<Cmplx>(
idx + 1 *
arg.threads);
575 de(0,2) = TEXTURE_DELTA<Cmplx>(
idx + 2 *
arg.threads);
576 de(1,1) = TEXTURE_DELTA<Cmplx>(
idx + 3 *
arg.threads);
577 de(1,2) = TEXTURE_DELTA<Cmplx>(
idx + 4 *
arg.threads);
578 de(2,2) = TEXTURE_DELTA<Cmplx>(
idx + 5 *
arg.threads);
580 de(0,0) =
arg.delta[
idx + 0 *
arg.threads];
581 de(0,1) =
arg.delta[
idx + 1 *
arg.threads];
582 de(0,2) =
arg.delta[
idx + 2 *
arg.threads];
583 de(1,1) =
arg.delta[
idx + 3 *
arg.threads];
584 de(1,2) =
arg.delta[
idx + 4 *
arg.threads];
585 de(2,2) =
arg.delta[
idx + 5 *
arg.threads];
587 de(1,0) = Cmplx(-de(0,1).
x, de(0,1).
y);
588 de(2,0) = Cmplx(-de(0,2).
x, de(0,2).
y);
589 de(2,1) = Cmplx(-de(1,2).
x, de(1,2).
y);
592 g += de * half_alpha;
594 reunit_link<Float>( g );
598 for (
int mu = 0;
mu < 4;
mu++ ) {
606 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX 607 de(0,0) = TEXTURE_DELTA<Cmplx>(
idx + 0 *
arg.threads);
608 de(0,1) = TEXTURE_DELTA<Cmplx>(
idx + 1 *
arg.threads);
609 de(0,2) = TEXTURE_DELTA<Cmplx>(
idx + 2 *
arg.threads);
610 de(1,1) = TEXTURE_DELTA<Cmplx>(
idx + 3 *
arg.threads);
611 de(1,2) = TEXTURE_DELTA<Cmplx>(
idx + 4 *
arg.threads);
612 de(2,2) = TEXTURE_DELTA<Cmplx>(
idx + 5 *
arg.threads);
614 de(0,0) =
arg.delta[
idx + 0 *
arg.threads];
615 de(0,1) =
arg.delta[
idx + 1 *
arg.threads];
616 de(0,2) =
arg.delta[
idx + 2 *
arg.threads];
617 de(1,1) =
arg.delta[
idx + 3 *
arg.threads];
618 de(1,2) =
arg.delta[
idx + 4 *
arg.threads];
619 de(2,2) =
arg.delta[
idx + 5 *
arg.threads];
621 de(1,0) = Cmplx(-de(0,1).
x, de(0,1).
y);
622 de(2,0) = Cmplx(-de(0,2).
x, de(0,2).
y);
623 de(2,1) = Cmplx(-de(1,2).
x, de(1,2).
y);
626 g0 += de * half_alpha;
628 reunit_link<Float>( g0 );
633 dataOr.save((Float*)(U.data),
id,
mu,
parity);
638 template<
typename Float,
typename Gauge>
640 GaugeFixArg<Float>
arg;
643 mutable char aux_string[128];
649 unsigned int minThreads()
const {
return arg.threads/2; }
652 GaugeFixNEW(Gauge & dataOr, GaugeFixArg<Float> &
arg, Float alpha)
653 : dataOr(dataOr),
arg(
arg) {
654 half_alpha = alpha * 0.5;
655 cudaFuncSetCacheConfig( kernel_gauge_fix_U_EO_NEW<Float, Gauge>, cudaFuncCachePreferL1);
659 void setAlpha(Float alpha){ half_alpha = alpha * 0.5; }
661 void apply(
const cudaStream_t &
stream){
663 kernel_gauge_fix_U_EO_NEW<Float, Gauge><< < tp.
grid, tp.
block, 0,
stream >> > (
arg, dataOr, half_alpha);
667 std::stringstream vol;
668 vol <<
arg.X[0] <<
"x" <<
arg.X[1] <<
"x" <<
arg.X[2] <<
"x" <<
arg.X[3];
669 sprintf(aux_string,
"threads=%d,prec=%lu",
arg.threads,
sizeof(Float));
670 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
681 long long flops()
const {
682 return 2414LL *
arg.threads;
685 long long bytes()
const {
686 return ( dataOr.Bytes() * 4LL + 5 * 12LL *
sizeof(Float)) *
arg.threads;
694 template <
int Elems,
typename Float>
695 __global__
void kernel_gauge_GX(GaugeFixArg<Float>
arg, Float half_alpha){
697 int id = blockIdx.x *
blockDim.x + threadIdx.x;
699 if (
id >=
arg.threads )
return;
701 typedef complex<Float> Cmplx;
705 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX 706 de(0,0) = TEXTURE_DELTA<Cmplx>(
id);
707 de(0,1) = TEXTURE_DELTA<Cmplx>(
id +
arg.threads);
708 de(0,2) = TEXTURE_DELTA<Cmplx>(
id + 2 *
arg.threads);
709 de(1,1) = TEXTURE_DELTA<Cmplx>(
id + 3 *
arg.threads);
710 de(1,2) = TEXTURE_DELTA<Cmplx>(
id + 4 *
arg.threads);
711 de(2,2) = TEXTURE_DELTA<Cmplx>(
id + 5 *
arg.threads);
713 de(0,0) =
arg.delta[
id];
714 de(0,1) =
arg.delta[
id +
arg.threads];
715 de(0,2) =
arg.delta[
id + 2 *
arg.threads];
716 de(1,1) =
arg.delta[
id + 3 *
arg.threads];
717 de(1,2) =
arg.delta[
id + 4 *
arg.threads];
718 de(2,2) =
arg.delta[
id + 5 *
arg.threads];
720 de(1,0) = makeComplex(-de(0,1).
x, de(0,1).
y);
721 de(2,0) = makeComplex(-de(0,2).
x, de(0,2).
y);
722 de(2,1) = makeComplex(-de(1,2).
x, de(1,2).
y);
727 g += de * half_alpha;
729 reunit_link<Float>( g );
733 int x3 =
id / (
arg.X[0] *
arg.X[1] *
arg.X[2]);
734 int x2 = (
id / (
arg.X[0] *
arg.X[1])) %
arg.X[2];
735 int x1 = (
id /
arg.X[0]) %
arg.X[1];
736 int x0 =
id %
arg.X[0];
737 id = (x0 + (x1 + (x2 + x3 *
arg.X[2]) *
arg.X[1]) *
arg.X[0]) >> 1;
738 id += ((x0 + x1 + x2 + x3) & 1 ) *
arg.threads / 2;
740 for (
int i = 0;
i < Elems;
i++ )
arg.gx[
id +
i *
arg.threads] = g.
data[
i];
748 template<
int Elems,
typename Float>
750 GaugeFixArg<Float>
arg;
752 mutable char aux_string[128];
754 unsigned int sharedBytesPerThread()
const {
761 bool tuneGridDim()
const {
764 unsigned int minThreads()
const {
769 GaugeFix_GX(GaugeFixArg<Float> &
arg, Float alpha)
771 half_alpha = alpha * 0.5;
772 cudaFuncSetCacheConfig( kernel_gauge_GX<Elems, Float>, cudaFuncCachePreferL1);
777 void setAlpha(Float alpha){
778 half_alpha = alpha * 0.5;
782 void apply(
const cudaStream_t &
stream){
784 kernel_gauge_GX<Elems, Float><< < tp.
grid, tp.
block, 0,
stream >> > (
arg, half_alpha);
788 std::stringstream vol;
789 vol <<
arg.X[0] <<
"x";
790 vol <<
arg.X[1] <<
"x";
791 vol <<
arg.X[2] <<
"x";
793 sprintf(aux_string,
"threads=%d,prec=%lu",
arg.threads,
sizeof(Float));
794 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
798 long long flops()
const {
799 if ( Elems == 6 )
return 208LL *
arg.threads;
800 else return 166LL *
arg.threads;
802 long long bytes()
const {
803 return 4LL * Elems *
sizeof(Float) *
arg.threads;
809 template <
int Elems,
typename Float,
typename Gauge>
810 __global__
void kernel_gauge_fix_U_EO( GaugeFixArg<Float>
arg, Gauge dataOr){
811 int idd = threadIdx.x + blockIdx.x *
blockDim.x;
813 if ( idd >=
arg.threads )
return;
817 if ( idd >=
arg.threads / 2 ) {
819 id -=
arg.threads / 2;
821 typedef complex<Float> Cmplx;
825 for (
int i = 0;
i < Elems;
i++ ) {
826 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX 827 g.
data[
i] = TEXTURE_GX<Cmplx>(idd +
i *
arg.threads);
833 g(2,0) =
conj(g(0,1) * g(1,2) - g(0,2) * g(1,1));
834 g(2,1) =
conj(g(0,2) * g(1,0) - g(0,0) * g(1,2));
835 g(2,2) =
conj(g(0,0) * g(1,1) - g(0,1) * g(1,0));
840 for (
int mu = 0;
mu < 4;
mu++ ) {
849 for (
int i = 0;
i < Elems;
i++ ) {
850 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX 851 g0.
data[
i] = TEXTURE_GX<Cmplx>(idm1 +
i *
arg.threads);
857 g0(2,0) =
conj(g0(0,1) * g0(1,2) - g0(0,2) * g0(1,1));
858 g0(2,1) =
conj(g0(0,2) * g0(1,0) - g0(0,0) * g0(1,2));
859 g0(2,2) =
conj(g0(0,0) * g0(1,1) - g0(0,1) * g0(1,0));
864 dataOr.save((Float*)(U.data),
id,
mu,
parity);
872 template<
int Elems,
typename Float,
typename Gauge>
874 GaugeFixArg<Float>
arg;
876 mutable char aux_string[128];
878 unsigned int sharedBytesPerThread()
const {
885 bool tuneGridDim()
const {
888 unsigned int minThreads()
const {
893 GaugeFix(Gauge & dataOr, GaugeFixArg<Float> &
arg)
894 : dataOr(dataOr),
arg(
arg) {
895 cudaFuncSetCacheConfig( kernel_gauge_fix_U_EO<Elems, Float, Gauge>, cudaFuncCachePreferL1);
900 void apply(
const cudaStream_t &
stream){
902 kernel_gauge_fix_U_EO<Elems, Float, Gauge><< < tp.
grid, tp.
block, 0,
stream >> > (
arg, dataOr);
906 std::stringstream vol;
907 vol <<
arg.X[0] <<
"x";
908 vol <<
arg.X[1] <<
"x";
909 vol <<
arg.X[2] <<
"x";
911 sprintf(aux_string,
"threads=%d,prec=%lu",
arg.threads,
sizeof(Float));
912 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
923 long long flops()
const {
924 if ( Elems == 6 )
return 1794LL *
arg.threads;
925 else return 1536LL *
arg.threads;
928 long long bytes()
const {
929 return 26LL * Elems *
sizeof(Float) *
arg.threads;
937 template<
int Elems,
typename Float,
typename Gauge,
int gauge_dir>
939 const int Nsteps,
const int verbose_interval, \
940 const Float alpha0,
const int autotune,
const double tolerance, \
941 const int stopWtheta) {
943 TimeProfile profileInternalGaugeFixFFT(
"InternalGaugeFixQudaFFT",
false);
947 Float alpha = alpha0;
948 std::cout <<
"\tAlpha parameter of the Steepest Descent Method: " << alpha << std::endl;
949 if ( autotune ) std::cout <<
"\tAuto tune active: yes" << std::endl;
950 else std::cout <<
"\tAuto tune active: no" << std::endl;
951 std::cout <<
"\tStop criterium: " << tolerance << std::endl;
952 if ( stopWtheta ) std::cout <<
"\tStop criterium method: theta" << std::endl;
953 else std::cout <<
"\tStop criterium method: Delta" << std::endl;
954 std::cout <<
"\tMaximum number of iterations: " << Nsteps << std::endl;
955 std::cout <<
"\tPrint convergence results at every " << verbose_interval <<
" steps" << std::endl;
958 unsigned int delta_pad = data.
X()[0] * data.
X()[1] * data.
X()[2] * data.
X()[3];
959 int4
size = make_int4( data.
X()[0], data.
X()[1], data.
X()[2], data.
X()[3] );
963 GaugeFixArg<Float>
arg(data, Elems);
968 GaugeFixFFTRotateArg<Float> arg_rotate(data);
969 GaugeFixFFTRotate<Float> GFRotate(arg_rotate);
971 GaugeFixSETINVPSP<Float> setinvpsp(
arg);
973 GaugeFixINVPSP<Float> invpsp(
arg);
976 #ifdef GAUGEFIXING_DONT_USE_GX 978 GaugeFixNEW<Float, Gauge> gfixNew(dataOr,
arg, alpha);
981 GaugeFix_GX<Elems, Float> calcGX(
arg, alpha);
982 GaugeFix<Elems, Float, Gauge> gfix(dataOr,
arg);
985 GaugeFixQualityArg<Float, Gauge> argQ(dataOr, data,
arg.delta);
986 GaugeFixQuality<Elems, Float, Gauge, gauge_dir> gfixquality(argQ);
988 gfixquality.apply(0);
989 double action0 = argQ.getAction();
990 printf(
"Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
994 for ( iter = 0; iter < Nsteps; iter++ ) {
995 for (
int k = 0; k < 6; k++ ) {
1001 complex<Float> *_array =
arg.delta + k * delta_pad;
1010 GFRotate.setDirection(0,
arg.gx, _array);
1027 GFRotate.setDirection(1, _array,
arg.gx);
1034 #ifdef GAUGEFIXING_DONT_USE_GX 1052 gfixquality.apply(0);
1053 double action = argQ.getAction();
1054 diff =
abs(action0 - action);
1055 if ((iter % verbose_interval) == (verbose_interval - 1))
1056 printf(
"Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
1057 if ( autotune && ((action - action0) < -1
e-14) ) {
1058 if ( alpha > 0.01 ) {
1059 alpha = 0.95 * alpha;
1060 #ifdef GAUGEFIXING_DONT_USE_GX 1061 gfixNew.setAlpha(alpha);
1063 calcGX.setAlpha(alpha);
1065 printf(
">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", alpha );
1071 if ( stopWtheta ) {
if ( argQ.getTheta() < tolerance )
break; }
1072 else {
if ( diff < tolerance )
break; }
1076 if ((iter % verbose_interval) != 0 )
1077 printf(
"Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter, argQ.getAction(), argQ.getTheta(), diff);
1081 const double max_error = 1
e-10;
1097 errorQuda(
"Error in the unitarization\n");
1112 double fftflop = 5.0 * (
log2((
double)( data.
X()[0] * data.
X()[1]) ) +
log2( (
double)(data.
X()[2] * data.
X()[3] )));
1113 fftflop *= (
double)( data.
X()[0] * data.
X()[1] * data.
X()[2] * data.
X()[3] );
1114 double gflops = setinvpsp.flops() + gfixquality.flops();
1115 double gbytes = setinvpsp.bytes() + gfixquality.bytes();
1116 double flop = invpsp.flops() * Elems;
1117 double byte = invpsp.bytes() * Elems;
1118 flop += (GFRotate.flops() + fftflop) * Elems * 2;
1119 byte += GFRotate.bytes() * Elems * 4;
1120 #ifdef GAUGEFIXING_DONT_USE_GX 1121 flop += gfixNew.flops();
1122 byte += gfixNew.bytes();
1124 flop += calcGX.flops();
1125 byte += calcGX.bytes();
1126 flop += gfix.flops();
1127 byte += gfix.bytes();
1129 flop += gfixquality.flops();
1130 byte += gfixquality.bytes();
1131 gflops += flop * iter;
1132 gbytes += byte * iter;
1133 gflops += 4588.0 * data.
X()[0]*data.
X()[1]*data.
X()[2]*data.
X()[3];
1134 gbytes += 8.0 * data.
X()[0]*data.
X()[1]*data.
X()[2]*data.
X()[3] * dataOr.Bytes() ;
1136 gflops = (gflops * 1
e-9) / (secs);
1137 gbytes = gbytes / (secs * 1e9);
1138 printfQuda(
"Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
1142 template<
int Elems,
typename Float,
typename Gauge>
1144 const int Nsteps,
const int verbose_interval,
const Float alpha,
const int autotune, \
1145 const double tolerance,
const int stopWtheta) {
1146 if ( gauge_dir != 3 ) {
1147 printf(
"Starting Landau gauge fixing with FFTs...\n");
1148 gaugefixingFFT<Elems, Float, Gauge, 4>(dataOr, data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1151 printf(
"Starting Coulomb gauge fixing with FFTs...\n");
1152 gaugefixingFFT<Elems, Float, Gauge, 3>(dataOr, data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1158 template<
typename Float>
1160 const int Nsteps,
const int verbose_interval,
const Float alpha,
const int autotune, \
1161 const double tolerance,
const int stopWtheta) {
1170 gaugefixingFFT<9, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1174 gaugefixingFFT<6, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1178 gaugefixingFFT<6, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1188 #endif // GPU_GAUGE_ALG 1203 const int Nsteps,
const int verbose_interval,
const double alpha,
const int autotune, \
1204 const double tolerance,
const int stopWtheta) {
1206 #ifdef GPU_GAUGE_ALG 1209 errorQuda(
"Gauge Fixing with FFTs in multi-GPU support NOT implemented yet!\n");
1212 errorQuda(
"Half precision not supported\n");
1215 gaugefixingFFT<float> (data, gauge_dir, Nsteps, verbose_interval, (
float)alpha, autotune, tolerance, stopWtheta);
1217 gaugefixingFFT<double>(data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1222 errorQuda(
"Gauge fixing has bot been built");
static __device__ __host__ int getIndexFull(int cb_index, const I X[4], int parity)
#define qudaMemcpy(dst, src, count, kind)
__device__ __host__ void setZero(Matrix< T, N > *m)
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
QudaVerbosity getVerbosity()
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
__host__ __device__ ValueType sqrt(ValueType x)
void SetPlanFFT2DMany(cufftHandle &plan, int4 size, int dim, float2 *data)
Creates a CUFFT plan supporting 4D (2D+2D) data layouts for single-precision complex-to-complex.
static bool reunit_svd_only
cudaColorSpinorField * tmp
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
static double svd_rel_error
void exit(int) __attribute__((noreturn))
static unsigned int delta
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
int printf(const char *,...) __attribute__((__format__(__printf__
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
__host__ __device__ ValueType sin(ValueType x)
def id
projector matrices ######################################################################## ...
for(int s=0;s< param.dc.Ls;s++)
#define pool_device_malloc(size)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
static double unitarize_eps
Main header file for host and device accessors to GaugeFields.
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
void ApplyFFT(cufftHandle &plan, float2 *data_in, float2 *data_out, int direction)
Call CUFFT to perform a single-precision complex-to-complex transform plan in the transform direction...
static bool reunit_allow_svd
void gaugefixingFFT(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
static double svd_abs_error
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define device_malloc(size)
QudaReconstructType Reconstruct() const
__host__ __device__ ValueType abs(ValueType x)
#define pool_device_free(ptr)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
static __device__ __host__ int linkNormalIndexP1(const int x[], const I X[4], const int mu)
#define CUFFT_SAFE_CALL(call)
int comm_dim_partitioned(int dim)
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)