QUDA  0.9.0
gauge_fix_fft.cu
Go to the documentation of this file.
1 #include <quda_internal.h>
2 #include <quda_matrix.h>
3 #include <tune_quda.h>
4 #include <gauge_field.h>
5 #include <gauge_field_order.h>
6 #include <launch_kernel.cuh>
7 #include <unitarization_links.h>
8 #include <atomic.cuh>
9 #include <cub_helper.cuh>
10 #include <index_helper.cuh>
11 
12 #include <cufft.h>
13 
14 #ifdef GPU_GAUGE_ALG
15 #include <CUFFT_Plans.h>
16 #endif
17 
18 namespace quda {
19 
20 #ifdef GPU_GAUGE_ALG
21 
22 //Comment if you don't want to use textures for Delta(x) and g(x)
23 #define GAUGEFIXING_SITE_MATRIX_LOAD_TEX
24 
25 //UNCOMMENT THIS IF YOU WANT TO USE LESS MEMORY
26 #define GAUGEFIXING_DONT_USE_GX
27 //Without using the precalculation of g(x),
28 //we lose some performance, because Delta(x) is written in normal lattice coordinates needed for the FFTs
29 //and the gauge array in even/odd format
30 
31 
32 
33 #ifdef HOST_DEBUG
34 #ifdef GAUGEFIXING_DONT_USE_GX
35 #warning Not using precalculated g(x)
36 #else
37 #warning Using precalculated g(x)
38 #endif
39 #endif
40 
41 
42 #ifndef FL_UNITARIZE_PI
43 #define FL_UNITARIZE_PI 3.14159265358979323846
44 #endif
45 
46 
 //Texture references used to fetch g(x) and Delta(x) site matrices when
 //GAUGEFIXING_SITE_MATRIX_LOAD_TEX is defined: float2 for single precision,
 //int4 (two packed doubles) for double precision.
 texture<float2, 1, cudaReadModeElementType> GXTexSingle;
 texture<int4, 1, cudaReadModeElementType> GXTexDouble;
//Delta is only stored using 12 real number parameters,
// (0,0), (0,1), (0,2), (1,1), (1,2) and (2,2)
// (0,0), (1,1) and (0,1) don't have real part, however we need a complex for the FFTs
 texture<float2, 1, cudaReadModeElementType> DELTATexSingle;
 texture<int4, 1, cudaReadModeElementType> DELTATexDouble;


 /**
  * @brief Fetch element @c id of g(x) from texture memory.
  * The generic template is a placeholder returning 0; only the
  * complex<float> / complex<double> specializations below are used.
  */
 template <class T>
 inline __device__ T TEXTURE_GX(int id){
   return 0.0;
 }
 template <>
 inline __device__ complex<float> TEXTURE_GX<complex<float> >(int id){
   return tex1Dfetch(GXTexSingle, id);
 }
 template <>
 inline __device__ complex<double> TEXTURE_GX<complex<double> >(int id){
   // double precision is stored as int4: reassemble the two doubles
   int4 u = tex1Dfetch(GXTexDouble, id);
   return complex<double>(__hiloint2double(u.y, u.x), __hiloint2double(u.w, u.z));
 }
 /**
  * @brief Fetch element @c id of Delta(x) from texture memory.
  * Generic placeholder returns 0; only the specializations are used.
  */
 template <class T>
 inline __device__ T TEXTURE_DELTA(int id){
   return 0.0;
 }
 template <>
 inline __device__ complex<float> TEXTURE_DELTA<complex<float> >(int id){
   return tex1Dfetch(DELTATexSingle, id);
 }
 template <>
 inline __device__ complex<double> TEXTURE_DELTA<complex<double> >(int id){
   // double precision is stored as int4: reassemble the two doubles
   int4 u = tex1Dfetch(DELTATexDouble, id);
   return complex<double>(__hiloint2double(u.y, u.x), __hiloint2double(u.w, u.z));
 }
82 
 /**
  * @brief Bind the Delta(x) (and, when precalculated g(x) is in use, the g(x))
  * device arrays to the single-precision texture references.
  * No-op unless GAUGEFIXING_SITE_MATRIX_LOAD_TEX is defined; the g(x) binding
  * is skipped when GAUGEFIXING_DONT_USE_GX is defined.
  */
 static void BindTex(complex<float> *delta, complex<float> *gx, size_t bytes){
#ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX
#ifndef GAUGEFIXING_DONT_USE_GX
 cudaBindTexture(0, GXTexSingle, gx, bytes);
#endif
 cudaBindTexture(0, DELTATexSingle, delta, bytes);
#endif
 }

 /** @brief Double-precision overload of BindTex (same conditional logic). */
 static void BindTex(complex<double> *delta, complex<double> *gx, size_t bytes){
#ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX
#ifndef GAUGEFIXING_DONT_USE_GX
 cudaBindTexture(0, GXTexDouble, gx, bytes);
#endif
 cudaBindTexture(0, DELTATexDouble, delta, bytes);
#endif
 }

 /** @brief Unbind the single-precision textures bound by BindTex. */
 static void UnBindTex(complex<float> *delta, complex<float> *gx){
#ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX
#ifndef GAUGEFIXING_DONT_USE_GX
 cudaUnbindTexture(GXTexSingle);
#endif
 cudaUnbindTexture(DELTATexSingle);
#endif
 }

 /** @brief Unbind the double-precision textures bound by BindTex. */
 static void UnBindTex(complex<double> *delta, complex<double> *gx){
#ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX
#ifndef GAUGEFIXING_DONT_USE_GX
 cudaUnbindTexture(GXTexDouble);
#endif
 cudaUnbindTexture(DELTATexDouble);
#endif
 }
118 
119 
120  template <typename Float>
121  struct GaugeFixFFTRotateArg {
122  int threads; // number of active threads required
123  int X[4]; // grid dimensions
124  complex<Float> *tmp0;
125  complex<Float> *tmp1;
126  GaugeFixFFTRotateArg(const cudaGaugeField &data){
127  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
128  threads = X[0] * X[1] * X[2] * X[3];
129  tmp0 = 0;
130  tmp1 = 0;
131  }
132  };
133 
134 
135 
136  template <int direction, typename Float>
137  __global__ void fft_rotate_kernel_2D2D(GaugeFixFFTRotateArg<Float> arg){ //Cmplx *data_in, Cmplx *data_out){
138  int id = blockIdx.x * blockDim.x + threadIdx.x;
139  if ( id >= arg.threads ) return;
140  if ( direction == 0 ) {
141  int x3 = id / (arg.X[0] * arg.X[1] * arg.X[2]);
142  int x2 = (id / (arg.X[0] * arg.X[1])) % arg.X[2];
143  int x1 = (id / arg.X[0]) % arg.X[1];
144  int x0 = id % arg.X[0];
145 
146  int id = x0 + (x1 + (x2 + x3 * arg.X[2]) * arg.X[1]) * arg.X[0];
147  int id_out = x2 + (x3 + (x0 + x1 * arg.X[0]) * arg.X[3]) * arg.X[2];
148  arg.tmp1[id_out] = arg.tmp0[id];
149  //data_out[id_out] = data_in[id];
150  }
151  if ( direction == 1 ) {
152 
153  int x1 = id / (arg.X[2] * arg.X[3] * arg.X[0]);
154  int x0 = (id / (arg.X[2] * arg.X[3])) % arg.X[0];
155  int x3 = (id / arg.X[2]) % arg.X[3];
156  int x2 = id % arg.X[2];
157 
158  int id = x2 + (x3 + (x0 + x1 * arg.X[0]) * arg.X[3]) * arg.X[2];
159  int id_out = x0 + (x1 + (x2 + x3 * arg.X[2]) * arg.X[1]) * arg.X[0];
160  arg.tmp1[id_out] = arg.tmp0[id];
161  //data_out[id_out] = data_in[id];
162  }
163  }
164 
165 
166 
167 
168 
169 
170  template<typename Float>
171  class GaugeFixFFTRotate : Tunable {
172  GaugeFixFFTRotateArg<Float> arg;
173  int direction;
174  mutable char aux_string[128]; // used as a label in the autotuner
175  private:
176  unsigned int sharedBytesPerThread() const {
177  return 0;
178  }
179  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
180  return 0;
181  }
182  //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
183  bool tuneGridDim() const {
184  return false;
185  } // Don't tune the grid dimensions.
186  unsigned int minThreads() const {
187  return arg.threads;
188  }
189 
190  public:
191  GaugeFixFFTRotate(GaugeFixFFTRotateArg<Float> &arg) : arg(arg) {
192  direction = 0;
193  }
194  ~GaugeFixFFTRotate () {
195  }
196  void setDirection(int dir, complex<Float> *data_in, complex<Float> *data_out){
197  direction = dir;
198  arg.tmp0 = data_in;
199  arg.tmp1 = data_out;
200  }
201 
202  void apply(const cudaStream_t &stream){
203  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
204  if ( direction == 0 )
205  fft_rotate_kernel_2D2D<0, Float ><< < tp.grid, tp.block, 0, stream >> > (arg);
206  else if ( direction == 1 )
207  fft_rotate_kernel_2D2D<1, Float ><< < tp.grid, tp.block, 0, stream >> > (arg);
208  else
209  errorQuda("Error in GaugeFixFFTRotate option.\n");
210  }
211 
212  TuneKey tuneKey() const {
213  std::stringstream vol;
214  vol << arg.X[0] << "x";
215  vol << arg.X[1] << "x";
216  vol << arg.X[2] << "x";
217  vol << arg.X[3];
218  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
219  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
220 
221  }
222 
223  long long flops() const {
224  return 0;
225  }
226  long long bytes() const {
227  return 4LL * sizeof(Float) * arg.threads;
228  }
229 
230  };
231 
232 
 /**
  * Argument container for the gauge-fixing quality measurement. Derives from
  * ReduceArg<double2> so the per-site (action, theta) pair can be reduced
  * over the grid; result_h is the host-visible result provided by ReduceArg.
  */
 template <typename Float, typename Gauge>
 struct GaugeFixQualityArg : public ReduceArg<double2> {
 int threads; // number of active threads required
 int X[4]; // grid dimensions
 Gauge dataOr;
 complex<Float> *delta; // output: Delta(x), 6 stored entries per site

 GaugeFixQualityArg(const Gauge &dataOr, const cudaGaugeField &data, complex<Float> * delta)
   : ReduceArg<double2>(), dataOr(dataOr), delta(delta) {
 for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
 threads = data.VolumeCB(); // checkerboard volume: one thread per even/odd site
 }
 double getAction(){ return result_h[0].x; } // reduced action value
 double getTheta(){ return result_h[0].y; }  // reduced theta value
 };
248 
249 
250 
251  template<int blockSize, int Elems, typename Float, typename Gauge, int gauge_dir>
252  __global__ void computeFix_quality(GaugeFixQualityArg<Float, Gauge> argQ){
253  int idx = threadIdx.x + blockIdx.x * blockDim.x;
254  int parity = threadIdx.y;
255 
256  double2 data = make_double2(0.0,0.0);
257  if ( idx < argQ.threads ) {
258  typedef complex<Float> Cmplx;
259 
260  int x[4];
261  getCoords(x, idx, argQ.X, parity);
263  setZero(&delta);
264  //idx = linkIndex(x,X);
265  for ( int mu = 0; mu < gauge_dir; mu++ ) {
266  Matrix<Cmplx,3> U;
267  argQ.dataOr.load((Float*)(U.data),idx, mu, parity);
268  delta -= U;
269  }
270  //18*gauge_dir
271  data.x = -delta(0,0).x - delta(1,1).x - delta(2,2).x;
272  //2
273  for ( int mu = 0; mu < gauge_dir; mu++ ) {
274  Matrix<Cmplx,3> U;
275  argQ.dataOr.load((Float*)(U.data),linkIndexM1(x,argQ.X,mu), mu, 1 - parity);
276  delta += U;
277  }
278  //18*gauge_dir
279  delta -= conj(delta);
280  //18
281  //SAVE DELTA!!!!!
283  idx = getIndexFull(idx, argQ.X, parity);
284  //Saving Delta
285  argQ.delta[idx] = delta(0,0);
286  argQ.delta[idx + 2 * argQ.threads] = delta(0,1);
287  argQ.delta[idx + 4 * argQ.threads] = delta(0,2);
288  argQ.delta[idx + 6 * argQ.threads] = delta(1,1);
289  argQ.delta[idx + 8 * argQ.threads] = delta(1,2);
290  argQ.delta[idx + 10 * argQ.threads] = delta(2,2);
291  //12
292  data.y = getRealTraceUVdagger(delta, delta);
293  //35
294  //T=36*gauge_dir+65
295  }
296 
297  reduce2d<blockSize,2>(argQ, data);
298  }
299 
300 
301 
302  template<int Elems, typename Float, typename Gauge, int gauge_dir>
303  class GaugeFixQuality : TunableLocalParity {
304  GaugeFixQualityArg<Float, Gauge> argQ;
305  mutable char aux_string[128]; // used as a label in the autotuner
306  private:
307 
308  unsigned int minThreads() const { return argQ.threads; }
309 
310  public:
311  GaugeFixQuality(GaugeFixQualityArg<Float, Gauge> &argQ)
312  : argQ(argQ) {
313  }
314  ~GaugeFixQuality () { }
315 
316  void apply(const cudaStream_t &stream){
317  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
318  argQ.result_h[0] = make_double2(0.0,0.0);
319  LAUNCH_KERNEL_LOCAL_PARITY(computeFix_quality, tp, stream, argQ, Elems, Float, Gauge, gauge_dir);
321  argQ.result_h[0].x /= (double)(3 * gauge_dir * 2 * argQ.threads);
322  argQ.result_h[0].y /= (double)(3 * 2 * argQ.threads);
323  }
324 
325  TuneKey tuneKey() const {
326  std::stringstream vol;
327  vol << argQ.X[0] << "x" << argQ.X[1] << "x" << argQ.X[2] << "x" << argQ.X[3];
328  sprintf(aux_string,"threads=%d,prec=%lu,gaugedir=%d", argQ.threads, sizeof(Float), gauge_dir);
329  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
330  }
331 
332  long long flops() const {
333  return (36LL * gauge_dir + 65LL) * 2 * argQ.threads;
334  } // Only correct if there is no link reconstruction, no cub reduction accounted also
335  long long bytes() const {
336  return (2LL * gauge_dir + 2LL) * Elems * 2 * argQ.threads * sizeof(Float);
337  } //Not accounting the reduction!!!
338 
339  };
340 
341 
342 
343  template <typename Float>
344  struct GaugeFixArg {
345  int threads; // number of active threads required
346  int X[4]; // grid dimensions
347  cudaGaugeField &data;
348  Float *invpsq;
349  complex<Float> *delta;
350  complex<Float> *gx;
351 
352  GaugeFixArg( cudaGaugeField & data, const int Elems) : data(data){
353  for ( int dir = 0; dir < 4; ++dir ) X[dir] = data.X()[dir];
354  threads = X[0] * X[1] * X[2] * X[3];
355  invpsq = (Float*)device_malloc(sizeof(Float) * threads);
356  delta = (complex<Float>*)device_malloc(sizeof(complex<Float>) * threads * 6);
357 #ifdef GAUGEFIXING_DONT_USE_GX
358  gx = (complex<Float>*)device_malloc(sizeof(complex<Float>) * threads);
359 #else
360  gx = (complex<Float>*)device_malloc(sizeof(complex<Float>) * threads * Elems);
361 #endif
362  BindTex(delta, gx, sizeof(complex<Float>) * threads * Elems);
363  }
364  void free(){
365  UnBindTex(delta, gx);
366  device_free(invpsq);
368  device_free(gx);
369  }
370  };
371 
372 
373 
374 
375  template <typename Float>
376  __global__ void kernel_gauge_set_invpsq(GaugeFixArg<Float> arg){
377  int id = blockIdx.x * blockDim.x + threadIdx.x;
378  if ( id >= arg.threads ) return;
379  int x1 = id / (arg.X[2] * arg.X[3] * arg.X[0]);
380  int x0 = (id / (arg.X[2] * arg.X[3])) % arg.X[0];
381  int x3 = (id / arg.X[2]) % arg.X[3];
382  int x2 = id % arg.X[2];
383  //id = x2 + (x3 + (x0 + x1 * arg.X[0]) * arg.X[3]) * arg.X[2];
384  Float sx = sin( (Float)x0 * FL_UNITARIZE_PI / (Float)arg.X[0]);
385  Float sy = sin( (Float)x1 * FL_UNITARIZE_PI / (Float)arg.X[1]);
386  Float sz = sin( (Float)x2 * FL_UNITARIZE_PI / (Float)arg.X[2]);
387  Float st = sin( (Float)x3 * FL_UNITARIZE_PI / (Float)arg.X[3]);
388  Float sinsq = sx * sx + sy * sy + sz * sz + st * st;
389  Float prcfact = 0.0;
390  //The FFT normalization is done here
391  if ( sinsq > 0.00001 ) prcfact = 4.0 / (sinsq * (Float)arg.threads);
392  arg.invpsq[id] = prcfact;
393  }
394 
395 
396  template<typename Float>
397  class GaugeFixSETINVPSP : Tunable {
398  GaugeFixArg<Float> arg;
399  mutable char aux_string[128]; // used as a label in the autotuner
400  private:
401  unsigned int sharedBytesPerThread() const {
402  return 0;
403  }
404  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
405  return 0;
406  }
407  bool tuneSharedBytes() const {
408  return false;
409  } // Don't tune shared memory
410  bool tuneGridDim() const {
411  return false;
412  } // Don't tune the grid dimensions.
413  unsigned int minThreads() const {
414  return arg.threads;
415  }
416 
417  public:
418  GaugeFixSETINVPSP(GaugeFixArg<Float> &arg) : arg(arg) { }
419  ~GaugeFixSETINVPSP () { }
420 
421  void apply(const cudaStream_t &stream){
422  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
423  kernel_gauge_set_invpsq<Float><< < tp.grid, tp.block, 0, stream >> > (arg);
424  }
425 
426  TuneKey tuneKey() const {
427  std::stringstream vol;
428  vol << arg.X[0] << "x";
429  vol << arg.X[1] << "x";
430  vol << arg.X[2] << "x";
431  vol << arg.X[3];
432  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
433  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
434 
435  }
436 
437  long long flops() const {
438  return 21 * arg.threads;
439  }
440  long long bytes() const {
441  return sizeof(Float) * arg.threads;
442  }
443 
444  };
445 
446  template<typename Float>
447  __global__ void kernel_gauge_mult_norm_2D(GaugeFixArg<Float> arg){
448  int id = blockIdx.x * blockDim.x + threadIdx.x;
449  if ( id < arg.threads ) arg.gx[id] = arg.gx[id] * arg.invpsq[id];
450  }
451 
452 
 /**
  * @brief Tunable wrapper around kernel_gauge_mult_norm_2D, which applies the
  * precomputed 1/p^2 filter to the Fourier-transformed field in arg.gx.
  */
 template<typename Float>
 class GaugeFixINVPSP : Tunable {
 GaugeFixArg<Float> arg; // held by value: pointer swaps below only affect this copy
 mutable char aux_string[128]; // used as a label in the autotuner
 private:
 unsigned int sharedBytesPerThread() const {
 return 0;
 }
 unsigned int sharedBytesPerBlock(const TuneParam &param) const {
 return 0;
 }
 //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
 bool tuneGridDim() const {
 return false;
 } // Don't tune the grid dimensions.
 unsigned int minThreads() const {
 return arg.threads;
 }

 public:
 GaugeFixINVPSP(GaugeFixArg<Float> &arg)
   : arg(arg){
 // kernel is a simple streaming multiply: prefer L1 over shared memory
 cudaFuncSetCacheConfig( kernel_gauge_mult_norm_2D<Float>, cudaFuncCachePreferL1);
 }
 ~GaugeFixINVPSP () {
 }

 void apply(const cudaStream_t &stream){
 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
 kernel_gauge_mult_norm_2D<Float><< < tp.grid, tp.block, 0, stream >> > (arg);
 }

 TuneKey tuneKey() const {
 std::stringstream vol;
 vol << arg.X[0] << "x";
 vol << arg.X[1] << "x";
 vol << arg.X[2] << "x";
 vol << arg.X[3];
 sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
 return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);

 }

 void preTune(){
 //since delta contents are irrelevant at this point, we can swap gx with delta
 complex<Float> *tmp = arg.gx;
 arg.gx = arg.delta;
 arg.delta = tmp;
 }
 void postTune(){
 // restores gx to the buffer saved in preTune; delta is left aliasing the
 // same buffer, which is harmless because this copy of arg never uses delta
 arg.gx = arg.delta;
 }
 long long flops() const {
 return 2LL * arg.threads;
 }
 long long bytes() const {
 return 5LL * sizeof(Float) * arg.threads;
 }

 };
513 
514 
515 
 /**
  * @brief Project a 3x3 complex matrix back onto SU(3) via Gram-Schmidt:
  * normalize row 0, orthogonalize row 1 against row 0 and normalize it, then
  * reconstruct row 2 as the conjugate cross product of rows 0 and 1.
  * The bare numeric comments are the flop counts of each step (total T=130).
  */
 template <typename Float>
 __host__ __device__ inline void reunit_link( Matrix<complex<Float>,3> &U ){

 complex<Float> t2((Float)0.0, (Float)0.0);
 Float t1 = 0.0;
 //first normalize first row
 //sum of squares of row
#pragma unroll
 for ( int c = 0; c < 3; c++ ) t1 += norm(U(0,c));
 t1 = (Float)1.0 / sqrt(t1);
 //14
 //used to normalize row
#pragma unroll
 for ( int c = 0; c < 3; c++ ) U(0,c) *= t1;
 //6
 // t2 = <row0, row1>, the component of row 1 along row 0
#pragma unroll
 for ( int c = 0; c < 3; c++ ) t2 += conj(U(0,c)) * U(1,c);
 //24
#pragma unroll
 for ( int c = 0; c < 3; c++ ) U(1,c) -= t2 * U(0,c);
 //24
 //normalize second row
 //sum of squares of row
 t1 = 0.0;
#pragma unroll
 for ( int c = 0; c < 3; c++ ) t1 += norm(U(1,c));
 t1 = (Float)1.0 / sqrt(t1);
 //14
 //used to normalize row
#pragma unroll
 for ( int c = 0; c < 3; c++ ) U(1, c) *= t1;
 //6
 //Reconstruct last row
 U(2,0) = conj(U(0,1) * U(1,2) - U(0,2) * U(1,1));
 U(2,1) = conj(U(0,2) * U(1,0) - U(0,0) * U(1,2));
 U(2,2) = conj(U(0,0) * U(1,1) - U(0,1) * U(1,0));
 //42
 //T=130
 }
555 
556 #ifdef GAUGEFIXING_DONT_USE_GX
557 
558  template <typename Float, typename Gauge>
559  __global__ void kernel_gauge_fix_U_EO_NEW( GaugeFixArg<Float> arg, Gauge dataOr, Float half_alpha){
560  int id = threadIdx.x + blockIdx.x * blockDim.x;
561  int parity = threadIdx.y;
562 
563  if ( id >= arg.threads/2 ) return;
564 
565  typedef complex<Float> Cmplx;
566 
567  int x[4];
568  getCoords(x, id, arg.X, parity);
569  int idx = ((x[3] * arg.X[2] + x[2]) * arg.X[1] + x[1]) * arg.X[0] + x[0];
570  Matrix<Cmplx,3> de;
571  //Read Delta
572 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX
573  de(0,0) = TEXTURE_DELTA<Cmplx>(idx + 0 * arg.threads);
574  de(0,1) = TEXTURE_DELTA<Cmplx>(idx + 1 * arg.threads);
575  de(0,2) = TEXTURE_DELTA<Cmplx>(idx + 2 * arg.threads);
576  de(1,1) = TEXTURE_DELTA<Cmplx>(idx + 3 * arg.threads);
577  de(1,2) = TEXTURE_DELTA<Cmplx>(idx + 4 * arg.threads);
578  de(2,2) = TEXTURE_DELTA<Cmplx>(idx + 5 * arg.threads);
579 #else
580  de(0,0) = arg.delta[idx + 0 * arg.threads];
581  de(0,1) = arg.delta[idx + 1 * arg.threads];
582  de(0,2) = arg.delta[idx + 2 * arg.threads];
583  de(1,1) = arg.delta[idx + 3 * arg.threads];
584  de(1,2) = arg.delta[idx + 4 * arg.threads];
585  de(2,2) = arg.delta[idx + 5 * arg.threads];
586 #endif
587  de(1,0) = Cmplx(-de(0,1).x, de(0,1).y);
588  de(2,0) = Cmplx(-de(0,2).x, de(0,2).y);
589  de(2,1) = Cmplx(-de(1,2).x, de(1,2).y);
590  Matrix<Cmplx,3> g;
591  setIdentity(&g);
592  g += de * half_alpha;
593  //36
594  reunit_link<Float>( g );
595  //130
596 
597 
598  for ( int mu = 0; mu < 4; mu++ ) {
599  Matrix<Cmplx,3> U;
600  Matrix<Cmplx,3> g0;
601  dataOr.load((Float*)(U.data),id, mu, parity);
602  U = g * U;
603  //198
605  //Read Delta
606 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX
607  de(0,0) = TEXTURE_DELTA<Cmplx>(idx + 0 * arg.threads);
608  de(0,1) = TEXTURE_DELTA<Cmplx>(idx + 1 * arg.threads);
609  de(0,2) = TEXTURE_DELTA<Cmplx>(idx + 2 * arg.threads);
610  de(1,1) = TEXTURE_DELTA<Cmplx>(idx + 3 * arg.threads);
611  de(1,2) = TEXTURE_DELTA<Cmplx>(idx + 4 * arg.threads);
612  de(2,2) = TEXTURE_DELTA<Cmplx>(idx + 5 * arg.threads);
613 #else
614  de(0,0) = arg.delta[idx + 0 * arg.threads];
615  de(0,1) = arg.delta[idx + 1 * arg.threads];
616  de(0,2) = arg.delta[idx + 2 * arg.threads];
617  de(1,1) = arg.delta[idx + 3 * arg.threads];
618  de(1,2) = arg.delta[idx + 4 * arg.threads];
619  de(2,2) = arg.delta[idx + 5 * arg.threads];
620 #endif
621  de(1,0) = Cmplx(-de(0,1).x, de(0,1).y);
622  de(2,0) = Cmplx(-de(0,2).x, de(0,2).y);
623  de(2,1) = Cmplx(-de(1,2).x, de(1,2).y);
624 
625  setIdentity(&g0);
626  g0 += de * half_alpha;
627  //36
628  reunit_link<Float>( g0 );
629  //130
630 
631  U = U * conj(g0);
632  //198
633  dataOr.save((Float*)(U.data),id, mu, parity);
634  }
635  }
636 
637 
638  template<typename Float, typename Gauge>
639  class GaugeFixNEW : TunableLocalParity {
640  GaugeFixArg<Float> arg;
641  Float half_alpha;
642  Gauge dataOr;
643  mutable char aux_string[128]; // used as a label in the autotuner
644  private:
645 
646  // since GaugeFixArg is used by other kernels that don't use
647  // tunableLocalParity, arg.threads stores Volume and not VolumeCB
648  // so we need to divide by two
649  unsigned int minThreads() const { return arg.threads/2; }
650 
651  public:
652  GaugeFixNEW(Gauge & dataOr, GaugeFixArg<Float> &arg, Float alpha)
653  : dataOr(dataOr), arg(arg) {
654  half_alpha = alpha * 0.5;
655  cudaFuncSetCacheConfig( kernel_gauge_fix_U_EO_NEW<Float, Gauge>, cudaFuncCachePreferL1);
656  }
657  ~GaugeFixNEW () { }
658 
659  void setAlpha(Float alpha){ half_alpha = alpha * 0.5; }
660 
661  void apply(const cudaStream_t &stream){
662  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
663  kernel_gauge_fix_U_EO_NEW<Float, Gauge><< < tp.grid, tp.block, 0, stream >> > (arg, dataOr, half_alpha);
664  }
665 
666  TuneKey tuneKey() const {
667  std::stringstream vol;
668  vol << arg.X[0] << "x" << arg.X[1] << "x" << arg.X[2] << "x" << arg.X[3];
669  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
670  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
671 
672  }
673 
674  //need this
675  void preTune() {
676  arg.data.backup();
677  }
678  void postTune() {
679  arg.data.restore();
680  }
681  long long flops() const {
682  return 2414LL * arg.threads;
683  //Not accounting here the reconstruction of the gauge if 12 or 8!!!!!!
684  }
685  long long bytes() const {
686  return ( dataOr.Bytes() * 4LL + 5 * 12LL * sizeof(Float)) * arg.threads;
687  }
688 
689  };
690 
691 
692 
693 #else
 /**
  * @brief Precalculate the gauge transformation
  * g(x) = reunit(1 + half_alpha * Delta(x)) for every lattice site.
  * Delta(x) is read in normal lattice order (6 stored entries per site,
  * stride arg.threads); g is written in even/odd order with Elems complex
  * entries per site so the link-update kernel can consume it directly.
  */
 template <int Elems, typename Float>
 __global__ void kernel_gauge_GX(GaugeFixArg<Float> arg, Float half_alpha){

 int id = blockIdx.x * blockDim.x + threadIdx.x;

 if ( id >= arg.threads ) return;

 typedef complex<Float> Cmplx;

 Matrix<Cmplx,3> de;
 //Read Delta
 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX
 de(0,0) = TEXTURE_DELTA<Cmplx>(id);
 de(0,1) = TEXTURE_DELTA<Cmplx>(id + arg.threads);
 de(0,2) = TEXTURE_DELTA<Cmplx>(id + 2 * arg.threads);
 de(1,1) = TEXTURE_DELTA<Cmplx>(id + 3 * arg.threads);
 de(1,2) = TEXTURE_DELTA<Cmplx>(id + 4 * arg.threads);
 de(2,2) = TEXTURE_DELTA<Cmplx>(id + 5 * arg.threads);
 #else
 de(0,0) = arg.delta[id];
 de(0,1) = arg.delta[id + arg.threads];
 de(0,2) = arg.delta[id + 2 * arg.threads];
 de(1,1) = arg.delta[id + 3 * arg.threads];
 de(1,2) = arg.delta[id + 4 * arg.threads];
 de(2,2) = arg.delta[id + 5 * arg.threads];
 #endif
 // fill the lower triangle from the stored upper triangle
 de(1,0) = makeComplex(-de(0,1).x, de(0,1).y);
 de(2,0) = makeComplex(-de(0,2).x, de(0,2).y);
 de(2,1) = makeComplex(-de(1,2).x, de(1,2).y);


 Matrix<Cmplx,3> g;
 setIdentity(&g);
 g += de * half_alpha;
 //36
 reunit_link<Float>( g );
 //130
 //gx is represented in even/odd order
 //normal lattice index to even/odd index
 int x3 = id / (arg.X[0] * arg.X[1] * arg.X[2]);
 int x2 = (id / (arg.X[0] * arg.X[1])) % arg.X[2];
 int x1 = (id / arg.X[0]) % arg.X[1];
 int x0 = id % arg.X[0];
 // checkerboard index, with odd sites offset by half the volume
 id = (x0 + (x1 + (x2 + x3 * arg.X[2]) * arg.X[1]) * arg.X[0]) >> 1;
 id += ((x0 + x1 + x2 + x3) & 1 ) * arg.threads / 2;

 for ( int i = 0; i < Elems; i++ ) arg.gx[id + i * arg.threads] = g.data[i];
 //T=166 for Elems 9
 //T=208 for Elems 6
 }
744 
745 
746 
747 
748  template<int Elems, typename Float>
749  class GaugeFix_GX : Tunable {
750  GaugeFixArg<Float> arg;
751  Float half_alpha;
752  mutable char aux_string[128]; // used as a label in the autotuner
753  private:
754  unsigned int sharedBytesPerThread() const {
755  return 0;
756  }
757  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
758  return 0;
759  }
760  //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
761  bool tuneGridDim() const {
762  return false;
763  } // Don't tune the grid dimensions.
764  unsigned int minThreads() const {
765  return arg.threads;
766  }
767 
768  public:
769  GaugeFix_GX(GaugeFixArg<Float> &arg, Float alpha)
770  : arg(arg) {
771  half_alpha = alpha * 0.5;
772  cudaFuncSetCacheConfig( kernel_gauge_GX<Elems, Float>, cudaFuncCachePreferL1);
773  }
774  ~GaugeFix_GX () {
775  }
776 
777  void setAlpha(Float alpha){
778  half_alpha = alpha * 0.5;
779  }
780 
781 
782  void apply(const cudaStream_t &stream){
783  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
784  kernel_gauge_GX<Elems, Float><< < tp.grid, tp.block, 0, stream >> > (arg, half_alpha);
785  }
786 
787  TuneKey tuneKey() const {
788  std::stringstream vol;
789  vol << arg.X[0] << "x";
790  vol << arg.X[1] << "x";
791  vol << arg.X[2] << "x";
792  vol << arg.X[3];
793  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
794  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
795 
796  }
797 
798  long long flops() const {
799  if ( Elems == 6 ) return 208LL * arg.threads;
800  else return 166LL * arg.threads;
801  }
802  long long bytes() const {
803  return 4LL * Elems * sizeof(Float) * arg.threads;
804  }
805 
806  };
807 
808 
 /**
  * @brief Apply the precalculated gauge transformation to the links:
  * U_mu(x) <- g(x) * U_mu(x) * g^dagger(x+mu).
  * g is stored in even/odd order with Elems (6 or 9) complex entries per
  * site; for Elems == 6 the third row is reconstructed from the first two.
  * One thread per full-volume site: the first arg.threads/2 threads handle
  * even sites, the remainder odd sites.
  */
 template <int Elems, typename Float, typename Gauge>
 __global__ void kernel_gauge_fix_U_EO( GaugeFixArg<Float> arg, Gauge dataOr){
 int idd = threadIdx.x + blockIdx.x * blockDim.x;

 if ( idd >= arg.threads ) return;

 // split the flat index into (parity, checkerboard index)
 int parity = 0;
 int id = idd;
 if ( idd >= arg.threads / 2 ) {
 parity = 1;
 id -= arg.threads / 2;
 }
 typedef complex<Float> Cmplx;

 Matrix<Cmplx,3> g;
 //for(int i = 0; i < Elems; i++) g.data[i] = arg.gx[idd + i * arg.threads];
 for ( int i = 0; i < Elems; i++ ) {
 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX
 g.data[i] = TEXTURE_GX<Cmplx>(idd + i * arg.threads);
 #else
 g.data[i] = arg.gx[idd + i * arg.threads];
 #endif
 }
 if ( Elems == 6 ) {
 // reconstruct the third row from the first two
 g(2,0) = conj(g(0,1) * g(1,2) - g(0,2) * g(1,1));
 g(2,1) = conj(g(0,2) * g(1,0) - g(0,0) * g(1,2));
 g(2,2) = conj(g(0,0) * g(1,1) - g(0,1) * g(1,0));
 //42
 }
 int x[4];
 getCoords(x, id, arg.X, parity);
 for ( int mu = 0; mu < 4; mu++ ) {
 Matrix<Cmplx,3> U;
 Matrix<Cmplx,3> g0;
 dataOr.load((Float*)(U.data),id, mu, parity);
 U = g * U;
 //198
 // even/odd index of the forward neighbor x+mu
 // NOTE(review): the name idm1 is misleading — linkIndexP1 is the +mu direction
 int idm1 = linkIndexP1(x,arg.X,mu);
 idm1 += (1 - parity) * arg.threads / 2;
 //for(int i = 0; i < Elems; i++) g0.data[i] = arg.gx[idm1 + i * arg.threads];
 for ( int i = 0; i < Elems; i++ ) {
 #ifdef GAUGEFIXING_SITE_MATRIX_LOAD_TEX
 g0.data[i] = TEXTURE_GX<Cmplx>(idm1 + i * arg.threads);
 #else
 g0.data[i] = arg.gx[idm1 + i * arg.threads];
 #endif
 }
 if ( Elems == 6 ) {
 g0(2,0) = conj(g0(0,1) * g0(1,2) - g0(0,2) * g0(1,1));
 g0(2,1) = conj(g0(0,2) * g0(1,0) - g0(0,0) * g0(1,2));
 g0(2,2) = conj(g0(0,0) * g0(1,1) - g0(0,1) * g0(1,0));
 //42
 }
 U = U * conj(g0);
 //198
 dataOr.save((Float*)(U.data),id, mu, parity);
 }
 //T=42+4*(198*2+42) Elems=6
 //T=4*(198*2) Elems=9
 //Not accounting here the reconstruction of the gauge if 12 or 8!!!!!!
 }
870 
871 
872  template<int Elems, typename Float, typename Gauge>
873  class GaugeFix : Tunable {
874  GaugeFixArg<Float> arg;
875  Gauge dataOr;
876  mutable char aux_string[128]; // used as a label in the autotuner
877  private:
878  unsigned int sharedBytesPerThread() const {
879  return 0;
880  }
881  unsigned int sharedBytesPerBlock(const TuneParam &param) const {
882  return 0;
883  }
884  //bool tuneSharedBytes() const { return false; } // Don't tune shared memory
885  bool tuneGridDim() const {
886  return false;
887  } // Don't tune the grid dimensions.
888  unsigned int minThreads() const {
889  return arg.threads;
890  }
891 
892  public:
893  GaugeFix(Gauge & dataOr, GaugeFixArg<Float> &arg)
894  : dataOr(dataOr), arg(arg) {
895  cudaFuncSetCacheConfig( kernel_gauge_fix_U_EO<Elems, Float, Gauge>, cudaFuncCachePreferL1);
896  }
897  ~GaugeFix () { }
898 
899 
900  void apply(const cudaStream_t &stream){
901  TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
902  kernel_gauge_fix_U_EO<Elems, Float, Gauge><< < tp.grid, tp.block, 0, stream >> > (arg, dataOr);
903  }
904 
905  TuneKey tuneKey() const {
906  std::stringstream vol;
907  vol << arg.X[0] << "x";
908  vol << arg.X[1] << "x";
909  vol << arg.X[2] << "x";
910  vol << arg.X[3];
911  sprintf(aux_string,"threads=%d,prec=%lu", arg.threads, sizeof(Float));
912  return TuneKey(vol.str().c_str(), typeid(*this).name(), aux_string);
913 
914  }
915 
916  //need this
917  void preTune() {
918  arg.data.backup();
919  }
920  void postTune() {
921  arg.data.restore();
922  }
923  long long flops() const {
924  if ( Elems == 6 ) return 1794LL * arg.threads;
925  else return 1536LL * arg.threads;
926  //Not accounting here the reconstruction of the gauge if 12 or 8!!!!!!
927  }
928  long long bytes() const {
929  return 26LL * Elems * sizeof(Float) * arg.threads;
930  }
931 
932  };
933 #endif
934 //GAUGEFIXING_DONT_USE_GX
935 
936 
937  template<int Elems, typename Float, typename Gauge, int gauge_dir>
938  void gaugefixingFFT( Gauge dataOr, cudaGaugeField& data, \
939  const int Nsteps, const int verbose_interval, \
940  const Float alpha0, const int autotune, const double tolerance, \
941  const int stopWtheta) {
942 
943  TimeProfile profileInternalGaugeFixFFT("InternalGaugeFixQudaFFT", false);
944 
945  profileInternalGaugeFixFFT.TPSTART(QUDA_PROFILE_COMPUTE);
946 
947  Float alpha = alpha0;
948  std::cout << "\tAlpha parameter of the Steepest Descent Method: " << alpha << std::endl;
949  if ( autotune ) std::cout << "\tAuto tune active: yes" << std::endl;
950  else std::cout << "\tAuto tune active: no" << std::endl;
951  std::cout << "\tStop criterium: " << tolerance << std::endl;
952  if ( stopWtheta ) std::cout << "\tStop criterium method: theta" << std::endl;
953  else std::cout << "\tStop criterium method: Delta" << std::endl;
954  std::cout << "\tMaximum number of iterations: " << Nsteps << std::endl;
955  std::cout << "\tPrint convergence results at every " << verbose_interval << " steps" << std::endl;
956 
957 
958  unsigned int delta_pad = data.X()[0] * data.X()[1] * data.X()[2] * data.X()[3];
959  int4 size = make_int4( data.X()[0], data.X()[1], data.X()[2], data.X()[3] );
960  cufftHandle plan_xy;
961  cufftHandle plan_zt;
962 
963  GaugeFixArg<Float> arg(data, Elems);
964  SetPlanFFT2DMany( plan_zt, size, 0, arg.delta); //for space and time ZT
965  SetPlanFFT2DMany( plan_xy, size, 1, arg.delta); //with space only XY
966 
967 
968  GaugeFixFFTRotateArg<Float> arg_rotate(data);
969  GaugeFixFFTRotate<Float> GFRotate(arg_rotate);
970 
971  GaugeFixSETINVPSP<Float> setinvpsp(arg);
972  setinvpsp.apply(0);
973  GaugeFixINVPSP<Float> invpsp(arg);
974 
975 
976 #ifdef GAUGEFIXING_DONT_USE_GX
977  //without using GX, gx will be created only for plane rotation but with less size
978  GaugeFixNEW<Float, Gauge> gfixNew(dataOr, arg, alpha);
979 #else
980  //using GX
981  GaugeFix_GX<Elems, Float> calcGX(arg, alpha);
982  GaugeFix<Elems, Float, Gauge> gfix(dataOr, arg);
983 #endif
984 
985  GaugeFixQualityArg<Float, Gauge> argQ(dataOr, data, arg.delta);
986  GaugeFixQuality<Elems, Float, Gauge, gauge_dir> gfixquality(argQ);
987 
988  gfixquality.apply(0);
989  double action0 = argQ.getAction();
990  printf("Step: %d\tAction: %.16e\ttheta: %.16e\n", 0, argQ.getAction(), argQ.getTheta());
991 
992  double diff = 0.0;
993  int iter = 0;
994  for ( iter = 0; iter < Nsteps; iter++ ) {
995  for ( int k = 0; k < 6; k++ ) {
996  //------------------------------------------------------------------------
997  // Set a pointer do the element k in lattice volume
998  // each element is stored with stride lattice volume
999  // it uses gx as temporary array!!!!!!
1000  //------------------------------------------------------------------------
1001  complex<Float> *_array = arg.delta + k * delta_pad;
1003  //------------------------------------------------------------------------
1004  // Perform FFT on xy plane
1005  //------------------------------------------------------------------------
1006  ApplyFFT(plan_xy, _array, arg.gx, CUFFT_FORWARD);
1007  //------------------------------------------------------------------------
1008  // Rotate hypercube, xyzt -> ztxy
1009  //------------------------------------------------------------------------
1010  GFRotate.setDirection(0, arg.gx, _array);
1011  GFRotate.apply(0);
1012  //------------------------------------------------------------------------
1013  // Perform FFT on zt plane
1014  //------------------------------------------------------------------------
1015  ApplyFFT(plan_zt, _array, arg.gx, CUFFT_FORWARD);
1016  //------------------------------------------------------------------------
1017  // Normalize FFT and apply pmax^2/p^2
1018  //------------------------------------------------------------------------
1019  invpsp.apply(0);
1020  //------------------------------------------------------------------------
1021  // Perform IFFT on zt plane
1022  //------------------------------------------------------------------------
1023  ApplyFFT(plan_zt, arg.gx, _array, CUFFT_INVERSE);
1024  //------------------------------------------------------------------------
1025  // Rotate hypercube, ztxy -> xyzt
1026  //------------------------------------------------------------------------
1027  GFRotate.setDirection(1, _array, arg.gx);
1028  GFRotate.apply(0);
1029  //------------------------------------------------------------------------
1030  // Perform IFFT on xy plane
1031  //------------------------------------------------------------------------
1032  ApplyFFT(plan_xy, arg.gx, _array, CUFFT_INVERSE);
1033  }
1034  #ifdef GAUGEFIXING_DONT_USE_GX
1035  //------------------------------------------------------------------------
1036  // Apply gauge fix to current gauge field
1037  //------------------------------------------------------------------------
1038  gfixNew.apply(0);
1039  #else
1040  //------------------------------------------------------------------------
1041  // Calculate g(x)
1042  //------------------------------------------------------------------------
1043  calcGX.apply(0);
1044  //------------------------------------------------------------------------
1045  // Apply gauge fix to current gauge field
1046  //------------------------------------------------------------------------
1047  gfix.apply(0);
1048  #endif
1049  //------------------------------------------------------------------------
1050  // Measure gauge quality and recalculate new Delta(x)
1051  //------------------------------------------------------------------------
1052  gfixquality.apply(0);
1053  double action = argQ.getAction();
1054  diff = abs(action0 - action);
1055  if ((iter % verbose_interval) == (verbose_interval - 1))
1056  printf("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter + 1, argQ.getAction(), argQ.getTheta(), diff);
1057  if ( autotune && ((action - action0) < -1e-14) ) {
1058  if ( alpha > 0.01 ) {
1059  alpha = 0.95 * alpha;
1060  #ifdef GAUGEFIXING_DONT_USE_GX
1061  gfixNew.setAlpha(alpha);
1062  #else
1063  calcGX.setAlpha(alpha);
1064  #endif
1065  printf(">>>>>>>>>>>>>> Warning: changing alpha down -> %.4e\n", alpha );
1066  }
1067  }
1068  //------------------------------------------------------------------------
1069  // Check gauge fix quality criterium
1070  //------------------------------------------------------------------------
1071  if ( stopWtheta ) { if ( argQ.getTheta() < tolerance ) break; }
1072  else { if ( diff < tolerance ) break; }
1073 
1074  action0 = action;
1075  }
1076  if ((iter % verbose_interval) != 0 )
1077  printf("Step: %d\tAction: %.16e\ttheta: %.16e\tDelta: %.16e\n", iter, argQ.getAction(), argQ.getTheta(), diff);
1078 
1079  // Reunitarize at end
1080  const double unitarize_eps = 1e-14;
1081  const double max_error = 1e-10;
1082  const int reunit_allow_svd = 1;
1083  const int reunit_svd_only = 0;
1084  const double svd_rel_error = 1e-6;
1085  const double svd_abs_error = 1e-6;
1089  int num_failures = 0;
1090  int* num_failures_dev = static_cast<int*>(pool_device_malloc(sizeof(int)));
1091  cudaMemset(num_failures_dev, 0, sizeof(int));
1092  unitarizeLinks(data, data, num_failures_dev);
1093  qudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);
1094 
1096  if ( num_failures > 0 ) {
1097  errorQuda("Error in the unitarization\n");
1098  exit(1);
1099  }
1100  // end reunitarize
1101 
1102 
1103  arg.free();
1104  CUFFT_SAFE_CALL(cufftDestroy(plan_zt));
1105  CUFFT_SAFE_CALL(cufftDestroy(plan_xy));
1106  checkCudaError();
1108  profileInternalGaugeFixFFT.TPSTOP(QUDA_PROFILE_COMPUTE);
1109 
1110  if (getVerbosity() > QUDA_SUMMARIZE){
1111  double secs = profileInternalGaugeFixFFT.Last(QUDA_PROFILE_COMPUTE);
1112  double fftflop = 5.0 * (log2((double)( data.X()[0] * data.X()[1]) ) + log2( (double)(data.X()[2] * data.X()[3] )));
1113  fftflop *= (double)( data.X()[0] * data.X()[1] * data.X()[2] * data.X()[3] );
1114  double gflops = setinvpsp.flops() + gfixquality.flops();
1115  double gbytes = setinvpsp.bytes() + gfixquality.bytes();
1116  double flop = invpsp.flops() * Elems;
1117  double byte = invpsp.bytes() * Elems;
1118  flop += (GFRotate.flops() + fftflop) * Elems * 2;
1119  byte += GFRotate.bytes() * Elems * 4; //includes FFT reads, assuming 1 read and 1 write per site
1120  #ifdef GAUGEFIXING_DONT_USE_GX
1121  flop += gfixNew.flops();
1122  byte += gfixNew.bytes();
1123  #else
1124  flop += calcGX.flops();
1125  byte += calcGX.bytes();
1126  flop += gfix.flops();
1127  byte += gfix.bytes();
1128  #endif
1129  flop += gfixquality.flops();
1130  byte += gfixquality.bytes();
1131  gflops += flop * iter;
1132  gbytes += byte * iter;
1133  gflops += 4588.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3]; //Reunitarize at end
1134  gbytes += 8.0 * data.X()[0]*data.X()[1]*data.X()[2]*data.X()[3] * dataOr.Bytes() ; //Reunitarize at end
1135 
1136  gflops = (gflops * 1e-9) / (secs);
1137  gbytes = gbytes / (secs * 1e9);
1138  printfQuda("Time: %6.6f s, Gflop/s = %6.1f, GB/s = %6.1f\n", secs, gflops, gbytes);
1139  }
1140  }
1141 
1142  template<int Elems, typename Float, typename Gauge>
1143  void gaugefixingFFT( Gauge dataOr, cudaGaugeField& data, const int gauge_dir, \
1144  const int Nsteps, const int verbose_interval, const Float alpha, const int autotune, \
1145  const double tolerance, const int stopWtheta) {
1146  if ( gauge_dir != 3 ) {
1147  printf("Starting Landau gauge fixing with FFTs...\n");
1148  gaugefixingFFT<Elems, Float, Gauge, 4>(dataOr, data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1149  }
1150  else {
1151  printf("Starting Coulomb gauge fixing with FFTs...\n");
1152  gaugefixingFFT<Elems, Float, Gauge, 3>(dataOr, data, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1153  }
1154  }
1155 
1156 
1157 
1158  template<typename Float>
1159  void gaugefixingFFT( cudaGaugeField& data, const int gauge_dir, \
1160  const int Nsteps, const int verbose_interval, const Float alpha, const int autotune, \
1161  const double tolerance, const int stopWtheta) {
1162 
1163  // Switching to FloatNOrder for the gauge field in order to support RECONSTRUCT_12
1164  // Need to fix this!!
1165  //9 and 6 means the number of complex elements used to store g(x) and Delta(x)
1166  if ( data.isNative() ) {
1167  if ( data.Reconstruct() == QUDA_RECONSTRUCT_NO ) {
1168  //printfQuda("QUDA_RECONSTRUCT_NO\n");
1169  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_NO>::type Gauge;
1170  gaugefixingFFT<9, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1171  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_12 ) {
1172  //printfQuda("QUDA_RECONSTRUCT_12\n");
1173  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_12>::type Gauge;
1174  gaugefixingFFT<6, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1175  } else if ( data.Reconstruct() == QUDA_RECONSTRUCT_8 ) {
1176  //printfQuda("QUDA_RECONSTRUCT_8\n");
1177  typedef typename gauge_mapper<Float,QUDA_RECONSTRUCT_8>::type Gauge;
1178  gaugefixingFFT<6, Float>(Gauge(data), data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1179 
1180  } else {
1181  errorQuda("Reconstruction type %d of gauge field not supported", data.Reconstruct());
1182  }
1183  } else {
1184  errorQuda("Invalid Gauge Order\n");
1185  }
1186  }
1187 
1188 #endif // GPU_GAUGE_ALG
1189 
1190 
1202  void gaugefixingFFT( cudaGaugeField& data, const int gauge_dir, \
1203  const int Nsteps, const int verbose_interval, const double alpha, const int autotune, \
1204  const double tolerance, const int stopWtheta) {
1205 
1206 #ifdef GPU_GAUGE_ALG
1207 #ifdef MULTI_GPU
1209  errorQuda("Gauge Fixing with FFTs in multi-GPU support NOT implemented yet!\n");
1210 #endif
1211  if ( data.Precision() == QUDA_HALF_PRECISION ) {
1212  errorQuda("Half precision not supported\n");
1213  }
1214  if ( data.Precision() == QUDA_SINGLE_PRECISION ) {
1215  gaugefixingFFT<float> (data, gauge_dir, Nsteps, verbose_interval, (float)alpha, autotune, tolerance, stopWtheta);
1216  } else if ( data.Precision() == QUDA_DOUBLE_PRECISION ) {
1217  gaugefixingFFT<double>(data, gauge_dir, Nsteps, verbose_interval, alpha, autotune, tolerance, stopWtheta);
1218  } else {
1219  errorQuda("Precision %d not supported", data.Precision());
1220  }
1221 #else
1222  errorQuda("Gauge fixing has bot been built");
1223 #endif
1224  }
1225 
1226 
1227 
1228 }
static __device__ __host__ int getIndexFull(int cb_index, const I X[4], int parity)
#define qudaMemcpy(dst, src, count, kind)
Definition: quda_cuda_api.h:32
dim3 dim3 blockDim
void free(void *)
double mu
Definition: test_util.cpp:1643
__device__ __host__ void setZero(Matrix< T, N > *m)
Definition: quda_matrix.h:592
#define LAUNCH_KERNEL_LOCAL_PARITY(kernel, tp, stream, arg,...)
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20
#define errorQuda(...)
Definition: util_quda.h:90
void setUnitarizeLinksConstants(double unitarize_eps, double max_error, bool allow_svd, bool svd_only, double svd_rel_error, double svd_abs_error)
int * num_failures_dev
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105
void SetPlanFFT2DMany(cufftHandle &plan, int4 size, int dim, float2 *data)
Creates a CUFFT plan supporting 4D (2D+2D) data layouts for single-precision complex-to-complex.
Definition: CUFFT_Plans.h:96
cudaStream_t * stream
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44
__device__ __host__ double getRealTraceUVdagger(const Matrix< T, 3 > &a, const Matrix< T, 3 > &b)
Definition: quda_matrix.h:1021
int num_failures
void exit(int) __attribute__((noreturn))
double log2(double)
QudaGaugeParam param
Definition: pack_test.cpp:17
static unsigned int delta
void unitarizeLinks(cudaGaugeField &outfield, const cudaGaugeField &infield, int *fails)
int printf(const char *,...) __attribute__((__format__(__printf__
static __device__ __host__ int linkIndexM1(const int x[], const I X[4], const int mu)
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:40
def id
projector matrices ######################################################################## ...
for(int s=0;s< param.dc.Ls;s++)
T data[N *N]
Definition: quda_matrix.h:74
#define pool_device_malloc(size)
Definition: malloc_quda.h:113
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603
Main header file for host and device accessors to GaugeFields.
#define tmp1
Definition: tmc_core.h:15
__device__ __host__ void SubTraceUnit(Matrix< T, 3 > &a)
Definition: quda_matrix.h:1015
cudaError_t qudaDeviceSynchronize()
Wrapper around cudaDeviceSynchronize or cuDeviceSynchronize.
__device__ __host__ void setIdentity(Matrix< T, N > *m)
Definition: quda_matrix.h:543
void ApplyFFT(cufftHandle &plan, float2 *data_in, float2 *data_out, int direction)
Call CUFFT to perform a single-precision complex-to-complex transform plan in the transform direction...
Definition: CUFFT_Plans.h:29
void gaugefixingFFT(cudaGaugeField &data, const int gauge_dir, const int Nsteps, const int verbose_interval, const double alpha, const int autotune, const double tolerance, const int stopWtheta)
Gauge fixing with Steepest descent method with FFTs with support for single GPU only.
int sprintf(char *, const char *,...) __attribute__((__format__(__printf__
#define printfQuda(...)
Definition: util_quda.h:84
int VolumeCB() const
unsigned long long flops
Definition: blas_quda.cu:42
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
Definition: complex_quda.h:880
#define device_malloc(size)
Definition: malloc_quda.h:52
const void * c
QudaReconstructType Reconstruct() const
Definition: gauge_field.h:203
__host__ __device__ ValueType abs(ValueType x)
Definition: complex_quda.h:110
#define pool_device_free(ptr)
Definition: malloc_quda.h:114
#define checkCudaError()
Definition: util_quda.h:129
__host__ __device__ ValueType conj(ValueType x)
Definition: complex_quda.h:115
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51
QudaPrecision Precision() const
static __device__ __host__ int linkIndexP1(const int x[], const I X[4], const int mu)
bool isNative() const
QudaParity parity
Definition: covdev_test.cpp:53
static __device__ __host__ int linkNormalIndexP1(const int x[], const I X[4], const int mu)
#define tmp0
Definition: tmc_core.h:14
#define CUFFT_SAFE_CALL(call)
Definition: CUFFT_Plans.h:10
unsigned long long bytes
Definition: blas_quda.cu:43
int comm_dim_partitioned(int dim)
const int * X() const
#define device_free(ptr)
Definition: malloc_quda.h:57
static __device__ __host__ void getCoords(int x[], int cb_index, const I X[], int parity)