23 #define GAUGEFIXING_DONT_USE_GX 29 #ifdef GAUGEFIXING_DONT_USE_GX 30 #warning Not using precalculated g(x) 32 #warning Using precalculated g(x) 37 #ifndef FL_UNITARIZE_PI 38 #define FL_UNITARIZE_PI 3.14159265358979323846 41 template <
typename Float>
42 struct GaugeFixFFTRotateArg {
48 for (
int dir = 0; dir < 4; ++dir ) X[dir] = data.
X()[dir];
49 threads = X[0] * X[1] * X[2] * X[3];
55 template <
int direction,
typename Float>
56 __global__
void fft_rotate_kernel_2D2D(GaugeFixFFTRotateArg<Float>
arg){
57 int id = blockIdx.x * blockDim.x + threadIdx.x;
58 if (
id >= arg.threads )
return;
59 if ( direction == 0 ) {
60 int x3 =
id / (arg.X[0] * arg.X[1] * arg.X[2]);
61 int x2 = (
id / (arg.X[0] * arg.X[1])) % arg.X[2];
62 int x1 = (
id / arg.X[0]) % arg.X[1];
63 int x0 =
id % arg.X[0];
65 int id = x0 + (x1 + (x2 + x3 * arg.X[2]) * arg.X[1]) * arg.X[0];
66 int id_out = x2 + (x3 + (x0 + x1 * arg.X[0]) * arg.X[3]) * arg.X[2];
67 arg.tmp1[id_out] = arg.tmp0[
id];
70 if ( direction == 1 ) {
72 int x1 =
id / (arg.X[2] * arg.X[3] * arg.X[0]);
73 int x0 = (
id / (arg.X[2] * arg.X[3])) % arg.X[0];
74 int x3 = (
id / arg.X[2]) % arg.X[3];
75 int x2 =
id % arg.X[2];
77 int id = x2 + (x3 + (x0 + x1 * arg.X[0]) * arg.X[3]) * arg.X[2];
78 int id_out = x0 + (x1 + (x2 + x3 * arg.X[2]) * arg.X[1]) * arg.X[0];
79 arg.tmp1[id_out] = arg.tmp0[
id];
89 template<
typename Float>
90 class GaugeFixFFTRotate :
Tunable {
91 GaugeFixFFTRotateArg<Float>
arg;
93 mutable char aux_string[128];
95 unsigned int sharedBytesPerThread()
const {
102 bool tuneGridDim()
const {
105 unsigned int minThreads()
const {
110 GaugeFixFFTRotate(GaugeFixFFTRotateArg<Float> &arg) : arg(arg) {
113 ~GaugeFixFFTRotate () {
115 void setDirection(
int dir, complex<Float> *data_in, complex<Float> *data_out){
121 void apply(
const cudaStream_t &
stream){
123 if ( direction == 0 )
124 fft_rotate_kernel_2D2D<0, Float > <<< tp.
grid, tp.
block, 0, stream >>> (
arg);
125 else if ( direction == 1 )
126 fft_rotate_kernel_2D2D<1, Float > <<< tp.
grid, tp.
block, 0, stream >>> (
arg);
128 errorQuda(
"Error in GaugeFixFFTRotate option.\n");
132 std::stringstream vol;
133 vol << arg.X[0] <<
"x";
134 vol << arg.X[1] <<
"x";
135 vol << arg.X[2] <<
"x";
137 sprintf(aux_string,
"threads=%d,prec=%lu", arg.threads,
sizeof(Float));
138 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
142 long long flops()
const {
145 long long bytes()
const {
146 return 4LL *
sizeof(Float) * arg.threads;
152 template <
typename Float,
typename Gauge>
153 struct GaugeFixQualityArg :
public ReduceArg<double2> {
157 complex<Float> *delta;
159 GaugeFixQualityArg(
const Gauge &dataOr,
const cudaGaugeField &data, complex<Float> * delta)
160 :
ReduceArg<double2>(), dataOr(dataOr), delta(delta) {
161 for (
int dir = 0; dir < 4; ++dir ) X[dir] = data.
X()[dir];
164 double getAction(){
return result_h[0].x; }
165 double getTheta(){
return result_h[0].y; }
168 template<
int blockSize,
int Elems,
typename Float,
typename Gauge,
int gauge_dir>
169 __global__
void computeFix_quality(GaugeFixQualityArg<Float, Gauge> argQ){
170 int idx_cb = threadIdx.x + blockIdx.x * blockDim.x;
173 double2 data = make_double2(0.0,0.0);
174 while (idx_cb < argQ.threads) {
175 typedef complex<Float> Cmplx;
182 for (
int mu = 0;
mu < gauge_dir;
mu++ ) {
187 data.x += -delta(0, 0).x - delta(1, 1).x - delta(2, 2).x;
189 for (
int mu = 0;
mu < gauge_dir;
mu++ ) {
194 delta -=
conj(delta);
200 argQ.delta[idx] = delta(0,0);
201 argQ.delta[idx + 2 * argQ.threads] = delta(0,1);
202 argQ.delta[idx + 4 * argQ.threads] = delta(0,2);
203 argQ.delta[idx + 6 * argQ.threads] = delta(1,1);
204 argQ.delta[idx + 8 * argQ.threads] = delta(1,2);
205 argQ.delta[idx + 10 * argQ.threads] = delta(2,2);
211 idx_cb += blockDim.x * gridDim.x;
214 reduce2d<blockSize,2>(argQ, data);
219 template<
int Elems,
typename Float,
typename Gauge,
int gauge_dir>
221 GaugeFixQualityArg<Float, Gauge> argQ;
222 mutable char aux_string[128];
225 bool tuneGridDim()
const {
return true; }
228 GaugeFixQuality(GaugeFixQualityArg<Float, Gauge> &argQ)
231 ~GaugeFixQuality () { }
233 void apply(
const cudaStream_t &stream){
235 argQ.result_h[0] = make_double2(0.0,0.0);
238 argQ.result_h[0].x /= (double)(3 * gauge_dir * 2 * argQ.threads);
239 argQ.result_h[0].y /= (double)(3 * 2 * argQ.threads);
243 std::stringstream vol;
244 vol << argQ.X[0] <<
"x" << argQ.X[1] <<
"x" << argQ.X[2] <<
"x" << argQ.X[3];
245 sprintf(aux_string,
"threads=%d,prec=%lu,gaugedir=%d", argQ.threads,
sizeof(Float), gauge_dir);
246 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
249 long long flops()
const {
250 return (36LL * gauge_dir + 65LL) * 2 * argQ.threads;
252 long long bytes()
const {
253 return (2LL * gauge_dir + 2LL) * Elems * 2 * argQ.threads *
sizeof(Float);
260 template <
typename Float>
266 complex<Float> *delta;
269 GaugeFixArg(
cudaGaugeField & data,
const int Elems) : data(data){
270 for (
int dir = 0; dir < 4; ++dir ) X[dir] = data.
X()[dir];
271 threads = X[0] * X[1] * X[2] * X[3];
273 delta = (complex<Float>*)
device_malloc(
sizeof(complex<Float>) * threads * 6);
274 #ifdef GAUGEFIXING_DONT_USE_GX 275 gx = (complex<Float>*)
device_malloc(
sizeof(complex<Float>) * threads);
277 gx = (complex<Float>*)
device_malloc(
sizeof(complex<Float>) * threads * Elems);
290 template <
typename Float>
291 __global__
void kernel_gauge_set_invpsq(GaugeFixArg<Float> arg){
292 int id = blockIdx.x * blockDim.x + threadIdx.x;
293 if (
id >= arg.threads )
return;
294 int x1 =
id / (arg.X[2] * arg.X[3] * arg.X[0]);
295 int x0 = (
id / (arg.X[2] * arg.X[3])) % arg.X[0];
296 int x3 = (
id / arg.X[2]) % arg.X[3];
297 int x2 =
id % arg.X[2];
299 Float sx =
sin( (Float)x0 * FL_UNITARIZE_PI / (Float)arg.X[0]);
300 Float sy =
sin( (Float)x1 * FL_UNITARIZE_PI / (Float)arg.X[1]);
301 Float sz =
sin( (Float)x2 * FL_UNITARIZE_PI / (Float)arg.X[2]);
302 Float st =
sin( (Float)x3 * FL_UNITARIZE_PI / (Float)arg.X[3]);
303 Float sinsq = sx * sx + sy * sy + sz * sz + st * st;
306 if ( sinsq > 0.00001 ) prcfact = 4.0 / (sinsq * (Float)arg.threads);
307 arg.invpsq[id] = prcfact;
311 template<
typename Float>
312 class GaugeFixSETINVPSP :
Tunable {
313 GaugeFixArg<Float>
arg;
314 mutable char aux_string[128];
316 unsigned int sharedBytesPerThread()
const {
322 bool tuneSharedBytes()
const {
325 bool tuneGridDim()
const {
328 unsigned int minThreads()
const {
333 GaugeFixSETINVPSP(GaugeFixArg<Float> &arg) : arg(arg) { }
334 ~GaugeFixSETINVPSP () { }
336 void apply(
const cudaStream_t &stream){
338 kernel_gauge_set_invpsq<Float> <<< tp.
grid, tp.
block, 0, stream >>> (
arg);
342 std::stringstream vol;
343 vol << arg.X[0] <<
"x";
344 vol << arg.X[1] <<
"x";
345 vol << arg.X[2] <<
"x";
347 sprintf(aux_string,
"threads=%d,prec=%lu", arg.threads,
sizeof(Float));
348 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
352 long long flops()
const {
353 return 21 * arg.threads;
355 long long bytes()
const {
356 return sizeof(Float) * arg.threads;
361 template<
typename Float>
362 __global__
void kernel_gauge_mult_norm_2D(GaugeFixArg<Float> arg){
363 int id = blockIdx.x * blockDim.x + threadIdx.x;
364 if (
id < arg.threads ) arg.gx[id] = arg.gx[id] * arg.invpsq[id];
368 template<
typename Float>
369 class GaugeFixINVPSP :
Tunable {
370 GaugeFixArg<Float>
arg;
371 mutable char aux_string[128];
373 unsigned int sharedBytesPerThread()
const {
380 bool tuneGridDim()
const {
383 unsigned int minThreads()
const {
388 GaugeFixINVPSP(GaugeFixArg<Float> &arg)
390 cudaFuncSetCacheConfig( kernel_gauge_mult_norm_2D<Float>, cudaFuncCachePreferL1);
395 void apply(
const cudaStream_t &stream){
397 kernel_gauge_mult_norm_2D<Float> <<< tp.
grid, tp.
block, 0, stream >>> (
arg);
401 std::stringstream vol;
402 vol << arg.X[0] <<
"x";
403 vol << arg.X[1] <<
"x";
404 vol << arg.X[2] <<
"x";
406 sprintf(aux_string,
"threads=%d,prec=%lu", arg.threads,
sizeof(Float));
407 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
413 complex<Float> *
tmp = arg.gx;
420 long long flops()
const {
421 return 2LL * arg.threads;
423 long long bytes()
const {
424 return 5LL *
sizeof(Float) * arg.threads;
431 template <
typename Float>
432 __host__ __device__
inline void reunit_link(
Matrix<complex<Float>,3> &U ){
434 complex<Float> t2((Float)0.0, (Float)0.0);
439 for (
int c = 0; c < 3; c++ ) t1 +=
norm(U(0,c));
440 t1 = (Float)1.0 /
sqrt(t1);
444 for (
int c = 0; c < 3; c++ ) U(0,c) *= t1;
447 for (
int c = 0; c < 3; c++ ) t2 +=
conj(U(0,c)) * U(1,c);
450 for (
int c = 0; c < 3; c++ ) U(1,c) -= t2 * U(0,c);
456 for (
int c = 0; c < 3; c++ ) t1 +=
norm(U(1,c));
457 t1 = (Float)1.0 /
sqrt(t1);
461 for (
int c = 0; c < 3; c++ ) U(1, c) *= t1;
464 U(2,0) =
conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
465 U(2,1) =
conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
466 U(2,2) =
conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
471 #ifdef GAUGEFIXING_DONT_USE_GX 473 template <
typename Float,
typename Gauge>
474 __global__
void kernel_gauge_fix_U_EO_NEW( GaugeFixArg<Float> arg, Gauge dataOr, Float half_alpha){
475 int id = threadIdx.x + blockIdx.x * blockDim.x;
476 int parity = threadIdx.y;
478 if (
id >= arg.threads/2 )
return;
480 typedef complex<Float> Cmplx;
484 int idx = ((x[3] * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0];
487 de(0,0) = arg.delta[idx + 0 * arg.threads];
488 de(0,1) = arg.delta[idx + 1 * arg.threads];
489 de(0,2) = arg.delta[idx + 2 * arg.threads];
490 de(1,1) = arg.delta[idx + 3 * arg.threads];
491 de(1,2) = arg.delta[idx + 4 * arg.threads];
492 de(2,2) = arg.delta[idx + 5 * arg.threads];
494 de(1,0) = Cmplx(-de(0,1).x, de(0,1).y);
495 de(2,0) = Cmplx(-de(0,2).x, de(0,2).y);
496 de(2,1) = Cmplx(-de(1,2).x, de(1,2).y);
499 g += de * half_alpha;
501 reunit_link<Float>( g );
505 for (
int mu = 0;
mu < 4;
mu++ ) {
512 de(0,0) = arg.delta[idx + 0 * arg.threads];
513 de(0,1) = arg.delta[idx + 1 * arg.threads];
514 de(0,2) = arg.delta[idx + 2 * arg.threads];
515 de(1,1) = arg.delta[idx + 3 * arg.threads];
516 de(1,2) = arg.delta[idx + 4 * arg.threads];
517 de(2,2) = arg.delta[idx + 5 * arg.threads];
519 de(1,0) = Cmplx(-de(0,1).x, de(0,1).y);
520 de(2,0) = Cmplx(-de(0,2).x, de(0,2).y);
521 de(2,1) = Cmplx(-de(1,2).x, de(1,2).y);
524 g0 += de * half_alpha;
526 reunit_link<Float>( g0 );
531 dataOr(
mu,
id, parity) = U;
536 template<
typename Float,
typename Gauge>
538 GaugeFixArg<Float>
arg;
541 mutable char aux_string[128];
547 unsigned int minThreads()
const {
return arg.threads/2; }
550 GaugeFixNEW(Gauge & dataOr, GaugeFixArg<Float> &arg, Float alpha)
551 : dataOr(dataOr), arg(arg) {
552 half_alpha = alpha * 0.5;
553 cudaFuncSetCacheConfig( kernel_gauge_fix_U_EO_NEW<Float, Gauge>, cudaFuncCachePreferL1);
557 void setAlpha(Float alpha){ half_alpha = alpha * 0.5; }
559 void apply(
const cudaStream_t &stream){
561 kernel_gauge_fix_U_EO_NEW<Float, Gauge> <<< tp.
grid, tp.
block, 0, stream >>> (
arg, dataOr, half_alpha);
565 std::stringstream vol;
566 vol << arg.X[0] <<
"x" << arg.X[1] <<
"x" << arg.X[2] <<
"x" << arg.X[3];
567 sprintf(aux_string,
"threads=%d,prec=%lu", arg.threads,
sizeof(Float));
568 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
579 long long flops()
const {
580 return 2414LL * arg.threads;
583 long long bytes()
const {
584 return ( dataOr.Bytes() * 4LL + 5 * 12LL *
sizeof(Float)) * arg.threads;
592 template <
int Elems,
typename Float>
593 __global__
void kernel_gauge_GX(GaugeFixArg<Float> arg, Float half_alpha){
595 int id = blockIdx.x * blockDim.x + threadIdx.x;
597 if (
id >= arg.threads )
return;
599 typedef complex<Float> Cmplx;
603 de(0,0) = arg.delta[id];
604 de(0,1) = arg.delta[
id + arg.threads];
605 de(0,2) = arg.delta[
id + 2 * arg.threads];
606 de(1,1) = arg.delta[
id + 3 * arg.threads];
607 de(1,2) = arg.delta[
id + 4 * arg.threads];
608 de(2,2) = arg.delta[
id + 5 * arg.threads];
610 de(1,0) = makeComplex(-de(0,1).x, de(0,1).y);
611 de(2,0) = makeComplex(-de(0,2).x, de(0,2).y);
612 de(2,1) = makeComplex(-de(1,2).x, de(1,2).y);
617 g += de * half_alpha;
619 reunit_link<Float>( g );
623 int x3 =
id / (arg.X[0] * arg.X[1] * arg.X[2]);
624 int x2 = (
id / (arg.X[0] * arg.X[1])) % arg.X[2];
625 int x1 = (
id / arg.X[0]) % arg.X[1];
626 int x0 =
id % arg.X[0];
627 id = (x0 + (x1 + (x2 + x3 * arg.X[2]) * arg.X[1]) * arg.X[0]) >> 1;
628 id += ((x0 + x1 + x2 + x3) & 1 ) * arg.threads / 2;
630 for (
int i = 0; i < Elems; i++ ) arg.gx[
id + i * arg.threads] = g.
data[i];
638 template<
int Elems,
typename Float>
640 GaugeFixArg<Float>
arg;
642 mutable char aux_string[128];
644 unsigned int sharedBytesPerThread()
const {
651 bool tuneGridDim()
const {
654 unsigned int minThreads()
const {
659 GaugeFix_GX(GaugeFixArg<Float> &arg, Float alpha)
661 half_alpha = alpha * 0.5;
662 cudaFuncSetCacheConfig( kernel_gauge_GX<Elems, Float>, cudaFuncCachePreferL1);
667 void setAlpha(Float alpha){
668 half_alpha = alpha * 0.5;
672 void apply(
const cudaStream_t &stream){
674 kernel_gauge_GX<Elems, Float> <<< tp.
grid, tp.
block, 0, stream >>> (
arg, half_alpha);
678 std::stringstream vol;
679 vol << arg.X[0] <<
"x";
680 vol << arg.X[1] <<
"x";
681 vol << arg.X[2] <<
"x";
683 sprintf(aux_string,
"threads=%d,prec=%lu", arg.threads,
sizeof(Float));
684 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
688 long long flops()
const {
689 if ( Elems == 6 )
return 208LL * arg.threads;
690 else return 166LL * arg.threads;
692 long long bytes()
const {
693 return 4LL * Elems *
sizeof(Float) * arg.threads;
699 template <
int Elems,
typename Float,
typename Gauge>
700 __global__
void kernel_gauge_fix_U_EO( GaugeFixArg<Float> arg, Gauge dataOr){
701 int idd = threadIdx.x + blockIdx.x * blockDim.x;
703 if ( idd >= arg.threads )
return;
707 if ( idd >= arg.threads / 2 ) {
709 id -= arg.threads / 2;
711 typedef complex<Float> Cmplx;
715 for (
int i = 0; i < Elems; i++ ) {
716 g.
data[i] = arg.gx[idd + i * arg.threads];
719 g(2,0) =
conj(g(0,1) * g(1,2) - g(0,2) * g(1,1));
720 g(2,1) =
conj(g(0,2) * g(1,0) - g(0,0) * g(1,2));
721 g(2,2) =
conj(g(0,0) * g(1,1) - g(0,1) * g(1,0));
726 for (
int mu = 0;
mu < 4;
mu++ ) {
732 idm1 += (1 -
parity) * arg.threads / 2;
734 for (
int i = 0; i < Elems; i++ ) {
735 g0.
data[i] = arg.gx[idm1 + i * arg.threads];
738 g0(2,0) =
conj(g0(0,1) * g0(1,2) - g0(0,2) * g0(1,1));
739 g0(2,1) =
conj(g0(0,2) * g0(1,0) - g0(0,0) * g0(1,2));
740 g0(2,2) =
conj(g0(0,0) * g0(1,1) - g0(0,1) * g0(1,0));
745 dataOr.save(
mu,
id, parity) = U;
753 template<
int Elems,
typename Float,
typename Gauge>
755 GaugeFixArg<Float>
arg;
757 mutable char aux_string[128];
759 unsigned int sharedBytesPerThread()
const {
766 bool tuneGridDim()
const {
769 unsigned int minThreads()
const {
774 GaugeFix(Gauge & dataOr, GaugeFixArg<Float> &arg)
775 : dataOr(dataOr), arg(arg) {
776 cudaFuncSetCacheConfig( kernel_gauge_fix_U_EO<Elems, Float, Gauge>, cudaFuncCachePreferL1);
781 void apply(
const cudaStream_t &stream){
783 kernel_gauge_fix_U_EO<Elems, Float, Gauge> <<< tp.
grid, tp.
block, 0, stream >>> (
arg, dataOr);
787 std::stringstream vol;
788 vol << arg.X[0] <<
"x";
789 vol << arg.X[1] <<
"x";
790 vol << arg.X[2] <<
"x";
792 sprintf(aux_string,
"threads=%d,prec=%lu", arg.threads,
sizeof(Float));
793 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux_string);
804 long long flops()
const {
805 if ( Elems == 6 )
return 1794LL * arg.threads;
806 else return 1536LL * arg.threads;
809 long long bytes()
const {
810 return 26LL * Elems *
sizeof(Float) * arg.threads;
818 template<
int Elems,
typename Float,
typename Gauge,
int gauge_dir>
820 const int Nsteps,
const int verbose_interval, \
821 const Float alpha0,
const int autotune,
const double tolerance, \
822 const int stopWtheta) {
824 TimeProfile profileInternalGaugeFixFFT(
"InternalGaugeFixQudaFFT",
false);
828 Float alpha = alpha0;
829 std::cout <<
"\tAlpha parameter of the Steepest Descent Method: " << alpha << std::endl;
830 if ( autotune ) std::cout <<
"\tAuto tune active: yes" << std::endl;
831 else std::cout <<
"\tAuto tune active: no" << std::endl;
832 std::cout <<
"\tStop criterium: " << tolerance << std::endl;
833 if ( stopWtheta ) std::cout <<
"\tStop criterium method: theta" << std::endl;
834 else std::cout <<
"\tStop criterium method: Delta" << std::endl;
835 std::cout <<
"\tMaximum number of iterations: " << Nsteps << std::endl;
836 std::cout <<
"\tPrint convergence results at every " << verbose_interval <<
" steps" << std::endl;
839 unsigned int delta_pad = data.
X()[0] * data.
X()[1] * data.
X()[2] * data.
X()[3];
840 int4
size = make_int4( data.
X()[0], data.
X()[1], data.
X()[2], data.
X()[3] );
844 GaugeFixArg<Float>
arg(data, Elems);
849 GaugeFixFFTRotateArg<Float> arg_rotate(data);
850 GaugeFixFFTRotate<Float> GFRotate(arg_rotate);
852 GaugeFixSETINVPSP<Float> setinvpsp(arg);
854 GaugeFixINVPSP<Float> invpsp(arg);
857 #ifdef GAUGEFIXING_DONT_USE_GX 859 GaugeFixNEW<Float, Gauge> gfixNew(dataOr, arg, alpha);
862 GaugeFix_GX<Elems, Float> calcGX(arg, alpha);
863 GaugeFix<Elems, Float, Gauge> gfix(dataOr, arg);
866 GaugeFixQualityArg<Float, Gauge> argQ(dataOr, data, arg.delta);
867 GaugeFixQuality<Elems, Float, Gauge, gauge_dir> gfixquality(argQ);
869 gfixquality.apply(0);
870 double action0 = argQ.getAction();
871 printf(
"Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
875 for ( iter = 0; iter < Nsteps; iter++ ) {
876 for (
int k = 0; k < 6; k++ ) {
882 complex<Float> *_array = arg.delta + k * delta_pad;
887 ApplyFFT(plan_xy, _array, arg.gx, CUFFT_FORWARD);
891 GFRotate.setDirection(0, arg.gx, _array);
896 ApplyFFT(plan_zt, _array, arg.gx, CUFFT_FORWARD);
904 ApplyFFT(plan_zt, arg.gx, _array, CUFFT_INVERSE);
908 GFRotate.setDirection(1, _array, arg.gx);
913 ApplyFFT(plan_xy, arg.gx, _array, CUFFT_INVERSE);
915 #ifdef GAUGEFIXING_DONT_USE_GX 933 gfixquality.apply(0);
934 double action = argQ.getAction();
935 diff =
abs(action0 - action);
936 if ((iter % verbose_interval) == (verbose_interval - 1))
937 printf(
"Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
938 if ( autotune && ((action - action0) < -1e-14) ) {
939 if ( alpha > 0.01 ) {
940 alpha = 0.95 * alpha;
941 #ifdef GAUGEFIXING_DONT_USE_GX 942 gfixNew.setAlpha(alpha);
944 calcGX.setAlpha(alpha);
946 printf(
">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", alpha );
952 if ( stopWtheta ) {
if ( argQ.getTheta() < tolerance )
break; }
953 else {
if ( diff < tolerance )
break; }
957 if ((iter % verbose_interval) != 0 )
958 printf(
"Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter, argQ.getAction(), argQ.getTheta(), diff);
962 const double max_error = 1e-10;
968 reunit_allow_svd, reunit_svd_only,
969 svd_rel_error, svd_abs_error);
972 cudaMemset(num_failures_dev, 0,
sizeof(
int));
974 qudaMemcpy(&num_failures, num_failures_dev,
sizeof(
int), cudaMemcpyDeviceToHost);
977 if ( num_failures > 0 ) {
978 errorQuda(
"Error in the unitarization\n");
993 double fftflop = 5.0 * (log2((
double)( data.
X()[0] * data.
X()[1]) ) + log2( (
double)(data.
X()[2] * data.
X()[3] )));
994 fftflop *= (double)( data.
X()[0] * data.
X()[1] * data.
X()[2] * data.
X()[3] );
995 double gflops = setinvpsp.flops() + gfixquality.flops();
996 double gbytes = setinvpsp.bytes() + gfixquality.bytes();
997 double flop = invpsp.flops() * Elems;
998 double byte = invpsp.bytes() * Elems;
999 flop += (GFRotate.flops() + fftflop) * Elems * 2;
1000 byte += GFRotate.bytes() * Elems * 4;
1001 #ifdef GAUGEFIXING_DONT_USE_GX 1002 flop += gfixNew.flops();
1003 byte += gfixNew.bytes();
1005 flop += calcGX.flops();
1006 byte += calcGX.bytes();
1007 flop += gfix.flops();
1008 byte += gfix.bytes();
1010 flop += gfixquality.flops();
1011 byte += gfixquality.bytes();
1012 gflops += flop * iter;
1013 gbytes += byte * iter;
1014 gflops += 4588.0 * data.
X()[0]*data.
X()[1]*data.
X()[2]*data.
X()[3];
1015 gbytes += 8.0 * data.
X()[0]*data.
X()[1]*data.
X()[2]*data.
X()[3] * dataOr.Bytes() ;
1017 gflops = (gflops * 1e-9) / (secs);
1018 gbytes = gbytes / (secs * 1e9);
1019 printfQuda(
"Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
1023 template<
int Elems,
typename Float,
typename Gauge>
1025 const int Nsteps,
const int verbose_interval,
const Float alpha,
const int autotune, \
1026 const double tolerance,
const int stopWtheta) {
1027 if ( gauge_dir != 3 ) {
1028 printf(
"Starting Landau gauge fixing with FFTs...\n");
1029 gaugefixingFFT<Elems, Float, Gauge, 4>(dataOr, data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1032 printf(
"Starting Coulomb gauge fixing with FFTs...\n");
1033 gaugefixingFFT<Elems, Float, Gauge, 3>(dataOr, data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1039 template<
typename Float>
1041 const int Nsteps,
const int verbose_interval,
const Float alpha,
const int autotune, \
1042 const double tolerance,
const int stopWtheta) {
1051 gaugefixingFFT<9, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1055 gaugefixingFFT<6, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1059 gaugefixingFFT<6, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1069 #endif // GPU_GAUGE_ALG 1084 const int Nsteps,
const int verbose_interval,
const double alpha,
const int autotune, \
1085 const double tolerance,
const int stopWtheta) {
1087 #ifdef GPU_GAUGE_ALG 1090 errorQuda(
"Gauge Fixing with FFTs in multi-GPU support NOT implemented yet!\n");
1093 errorQuda(
"Half precision not supported\n");
1096 gaugefixingFFT<float> (data, gauge_dir, Nsteps, verbose_interval, (float)alpha, autotune, tolerance, stopWtheta);
1098 gaugefixingFFT<double>(data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1103 errorQuda(
"Gauge fixing has bot been built");
static bool reunit_allow_svd
static __device__ __host__ int getIndexFull(int cb_index, const I X[4], int parity)
#define qudaMemcpy(dst, src, count, kind)
cudaColorSpinorField * tmp1
__device__ __host__ void setZero(Matrix< T, N > *m)
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
QudaVerbosity getVerbosity()
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
__host__ __device__ ValueType sqrt(ValueType x)
void SetPlanFFT2DMany(cufftHandle &plan, int4 size, int dim, float2 *data)
Creates a CUFFT plan supporting 4D (2D+2D) data layouts for single-precision complex-to-complex.
cudaColorSpinorField * tmp
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
static double svd_rel_error
#define qudaDeviceSynchronize()
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
__host__ __device__ ValueType sin(ValueType x)
static bool reunit_svd_only
#define pool_device_malloc(size)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
static double svd_abs_error
Main header file for host and device accessors to GaugeFields.
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
__device__ __host__ void setIdentity(Matrix< T, N > *m)
void ApplyFFT(cufftHandle &plan, float2 *data_in, float2 *data_out, int direction)
Call CUFFT to perform a single-precision complex-to-complex transform plan in the transform direction...
void gaugefixingFFT(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
static double unitarize_eps
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
#define device_malloc(size)
QudaReconstructType Reconstruct() const
__host__ __device__ ValueType abs(ValueType x)
#define pool_device_free(ptr)
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
static __device__ __host__ int linkNormalIndexP1(const int x[], const I X[4], const int mu)
#define CUFFT_SAFE_CALL(call)
int comm_dim_partitioned(int dim)
__host__ __device__ int getCoords(int coord[], const Arg &arg, int &idx, int parity, int &dim)
Compute the space-time coordinates we are at.