v0.9.0/doc/new__half_8cu_source.html

 /*

   This is just an experiment in using polar coordinates instead of
   Cartesian coordinates for storing complex numbers.  The supposition
   is that a fixed-point form for polar coordinates will lead to a much
   more efficient use of bits, giving higher accuracy with 8-bit
   and 16-bit storage than otherwise possible.

 */

 #include <quda_internal.h>
 #include <register_traits.h>

 using namespace quda;

 // Largest value representable in an unsigned 16-bit integer, as a float.
 #define MAX_USHORT 65535.0f

 // Unsigned 16-bit fixed point -> float: maps 0..65535 onto 0..1.
 inline void ucopy(float &a, const ushort &b) { a = (float)b/MAX_USHORT; }

 // Float -> unsigned 16-bit fixed point: scales b by 65535 and truncates.
 // The cast must be to ushort: the scaled value goes up to 65535, which is
 // out of range for a signed short (undefined behaviour on conversion).
 // No clamping is done here; b is expected to lie in [0,1].
 inline void ucopy(ushort &a, const float &b) { a = (ushort)(b*MAX_USHORT); }


 /*
   Cartesian ("old") format: read the 24 shorts of site idx from in,
   convert each to float with copy() and rescale by the site's norm.

   spinor : output, 24 floats for this site
   in     : packed short field, 24 entries per site
   norm   : per-site scale factors, one float per site
   idx    : site index

   NOTE(review): the short->float mapping is whatever copy() from
   register_traits.h implements; it is not visible in this file.
  */
 void old_load_half(float spinor[24], short *in, float *norm, int idx) {

   short *site = in + idx*24;   // first of the 24 entries belonging to this site
   int k = 0;
   while (k < 24) {
     copy(spinor[k], site[k]);
     spinor[k] *= norm[idx];
     ++k;
   }

 }

 /*
   Cartesian ("old") format: store the 24 floats of one site as shorts.
   The largest absolute component is written to norm[idx] and every
   component is divided by it before being converted with copy().

   spinor : input, 24 floats for this site
   out    : packed short field, 24 entries per site
   norm   : per-site scale factors, one float per site (written)
   idx    : site index

   An all-zero site has max == 0; in that case zeros are stored instead
   of evaluating 0/0 (NaN) and converting that to short.

   NOTE(review): the float->short mapping is whatever copy() from
   register_traits.h implements; it is not visible in this file.
  */
 void old_save_half(float spinor[24], short *out, float *norm, int idx) {

   float max = 0.0;
   for (int i=0; i<24; i++) {
     float tmp = fabs(spinor[i]);
     if (tmp > max) max = tmp;
   }

   norm[idx] = max;
   for (int i=0; i<24; i++) {
     // guard the division: max is zero only when every component is zero
     const float scaled = (max > 0.0f) ? spinor[i]/max : 0.0f;
     copy(out[idx*24+i], scaled);
   }

 }

 /*
   Polar ("new") format: rebuild the 12 complex numbers of site idx.
   Each complex number occupies two consecutive 16-bit slots: the first
   is read as an unsigned magnitude via ucopy() and multiplied by
   norm[idx], the second is read as a signed phase via copy() and
   multiplied by pi.  The result is written as (re, im) pairs.

   spinor : output, 24 floats (12 re/im pairs)
   in     : packed 16-bit field, 24 entries per site
   norm   : per-site scale factors, one float per site
   idx    : site index

   NOTE(review): the short->float mapping of copy() lives in
   register_traits.h and is not visible in this file.
  */
 void new_load_half(float spinor[24], short *in, float *norm, int idx) {

   ushort *unsigned_in = (ushort*)in;   // same storage, viewed as unsigned for the magnitudes

   for (int c=0; c<12; c++) {
     const int slot = idx*24 + 2*c;     // magnitude at slot, phase at slot+1

     float r;
     ucopy(r, unsigned_in[slot]);
     r *= norm[idx];

     float theta;
     copy(theta, in[slot+1]);
     theta *= M_PI;

     spinor[2*c]   = r*cos(theta);
     spinor[2*c+1] = r*sin(theta);
   }

 }

 /*
   Polar ("new") format: store the 12 complex numbers of one site.
   The largest magnitude is written to norm[idx].  For each complex
   number the magnitude divided by that maximum is stored through
   ucopy() in the even 16-bit slot, and atan2(im, re)/pi is stored
   through copy() in the odd slot.

   spinor : input, 24 floats (12 re/im pairs)
   out    : packed 16-bit field, 24 entries per site
   norm   : per-site scale factors, one float per site (written)
   idx    : site index

   Each magnitude is computed once and reused for both the maximum
   search and the normalisation.  An all-zero site has max == 0; zeros
   are stored for the magnitudes in that case instead of 0/0 (NaN).

   NOTE(review): the float->short mapping of copy() lives in
   register_traits.h and is not visible in this file.
  */
 void new_save_half(float spinor[24], short *out, float *norm, int idx) {

   float mag[12];
   float max = 0.0;
   for (int i=0; i<12; i++) {
     mag[i] = sqrt(spinor[2*i+0]*spinor[2*i+0] + spinor[2*i+1]*spinor[2*i+1]);
     if (mag[i] > max) max = mag[i];
   }

   norm[idx] = max;
   for (int i=0; i<12; i++) {
     float phase = atan2(spinor[2*i+1], spinor[2*i+0]) / M_PI;
     // guard the division: max is zero only when every magnitude is zero
     const float scaled = (max > 0.0f) ? mag[i]/max : 0.0f;

     ucopy(((ushort*)out)[idx*24+i*2+0], scaled);
     copy(out[idx*24+i*2+1], phase);
   }

 }

 /*
   Pack a float field of N sites (24 floats per site) into the Cartesian
   short format, one site at a time, via old_save_half().

   out  : destination short field
   norm : destination per-site norms
   in   : source float field
   N    : number of sites
  */
 void oldCopyToHalf(short *out, float *norm, float *in, int N) {
   float site[24];   // scratch copy of the current site
   for (int s=0; s<N; s++) {
     const float *src = in + s*24;
     for (int k=0; k<24; k++) site[k] = src[k];
     old_save_half(site, out, norm, s);
   }

 }

 /*
   Unpack a Cartesian short field of N sites back into floats
   (24 per site), one site at a time, via old_load_half().

   out  : destination float field
   in   : source short field
   norm : source per-site norms
   N    : number of sites
  */
 void oldCopyToFloat(float *out, short *in, float *norm, int N) {
   float site[24];   // scratch buffer filled by old_load_half
   for (int s=0; s<N; s++) {
     old_load_half(site, in, norm, s);
     float *dst = out + s*24;
     for (int k=0; k<24; k++) dst[k] = site[k];
   }

 }

 /*
   Pack a float field of N sites (24 floats per site) into the polar
   16-bit format, one site at a time, via new_save_half().

   out  : destination 16-bit field
   norm : destination per-site norms
   in   : source float field
   N    : number of sites
  */
 void newCopyToHalf(short *out, float *norm, float *in, int N) {
   float site[24];   // scratch copy of the current site
   for (int s=0; s<N; s++) {
     const float *src = in + s*24;
     for (int k=0; k<24; k++) site[k] = src[k];
     new_save_half(site, out, norm, s);
   }

 }

 /*
   Unpack a polar 16-bit field of N sites back into floats
   (24 per site), one site at a time, via new_load_half().

   out  : destination float field
   in   : source 16-bit field
   norm : source per-site norms
   N    : number of sites
  */
 void newCopyToFloat(float *out, short *in, float *norm, int N) {
   float site[24];   // scratch buffer filled by new_load_half
   for (int s=0; s<N; s++) {
     new_load_half(site, in, norm, s);
     float *dst = out + s*24;
     for (int k=0; k<24; k++) dst[k] = site[k];
   }

 }

 /*
   Fill all 24*N entries of field, in storage order, with
   1000 * comm_drand()^power.

   field : destination float field, 24 entries per site
   N     : number of sites
   power : exponent applied to each random draw

   NOTE(review): the range of comm_drand() is not shown in this file.
  */
 void insertNoise(float *field, int N, float power) {
   const int count = 24*N;
   for (int k=0; k<count; k++) {
     field[k] = 1000*pow(comm_drand(), power);
   }
 }

 /*
   Mean relative L2 deviation between two fields of N sites with 24
   floats per site.  For every site this evaluates

       sqrt( sum_i (a_i - b_i)^2 / sum_i a_i^2 )

   and the per-site values are averaged over the N sites.

   a : reference field (its norm is the denominator)
   b : field being compared against the reference
   N : number of sites

   The deviation is the squared difference of the components, not the
   difference of the squared norms, so sign and phase errors are counted.
   Sites where a and b agree exactly contribute zero; this also covers an
   all-zero reference site, which would otherwise give 0/0.  Returns 0
   when N <= 0.
  */
 double l2(float *a, float *b, int N) {

   if (N <= 0) return 0.0;

   double rtn = 0.0;
   for (int j=0; j<N; j++) {
     double dif = 0.0;
     double nrm = 0.0;
     for (int i=0; i<24; i++) {
       // promote to double first so the products are not rounded to float
       const double ai = a[j*24+i];
       const double bi = b[j*24+i];
       dif += (ai-bi)*(ai-bi);
       nrm += ai*ai;
     }
     // dif == 0 means an exact match; skipping it avoids 0/0 for zero sites
     if (dif > 0.0) rtn += sqrt(dif/nrm);
   }
   return rtn/N;
 }

 /*
   Driver for the experiment.  For powers 0.0, 0.1, ..., 1.9 it fills a
   reference field with noise, round-trips it through the polar ("new")
   and Cartesian ("old") 16-bit formats, and prints the l2() deviation
   of each reconstruction from the reference together with their ratio.

   The sweep is driven by an integer step count rather than a float
   accumulator, so the number of iterations and the power values do not
   depend on accumulated rounding.  Each deviation is computed once per
   iteration and reused for the ratio.
  */
 int main() {
   const int N = 1000;
   const int n_steps = 20;   // power = 0.1*step for step = 0..19

   float *ref = (float*)safe_malloc(24*N*sizeof(float));
   short *old_half = (short*)safe_malloc(24*N*sizeof(short));
   float *old_norm = (float*)safe_malloc(N*sizeof(float));
   float *old_recon = (float*)safe_malloc(24*N*sizeof(float));
   short *new_half = (short*)safe_malloc(24*N*sizeof(short));
   float *new_norm = (float*)safe_malloc(N*sizeof(float));
   float *new_recon = (float*)safe_malloc(24*N*sizeof(float));

   for (int step=0; step<n_steps; step++) {
     const float power = 0.1f*step;
     insertNoise(ref, N, power);

     newCopyToHalf(new_half,new_norm,ref,N);
     newCopyToFloat(new_recon,new_half,new_norm,N);

     oldCopyToHalf(old_half,old_norm,ref,N);
     oldCopyToFloat(old_recon,old_half,old_norm,N);

     const double old_dev = l2(ref,old_recon,N);
     const double new_dev = l2(ref,new_recon,N);

     printf("pow=%e, L2 spinor deviation: old = %e, new = %e, ratio = %e\n",
            power, old_dev, new_dev, old_dev / new_dev);

     // per-component dump, only active when N is set to 1 for debugging
     if (N==1) {
       for (int j=0; j<N; j++) {
         for (int i=0; i<12; i++) {
           printf("power=%4.2f i=%d ref=(%e,%e) old=(%e,%e), new=(%e,%e)\n",
                  power, i, ref[j*24+i*2+0], ref[j*24+i*2+1],
                  old_recon[j*24+i*2+0], old_recon[j*24+i*2+1],
                  new_recon[j*24+i*2+0], new_recon[j*24+i*2+1]);
         }
       }
     }
   }

   host_free(old_norm);
   host_free(old_half);
   host_free(old_recon);
   host_free(new_norm);
   host_free(new_half);
   host_free(new_recon);
   host_free(ref);

   return 0;
 }
newCopyToHalf
void newCopyToHalf(short *out, float *norm, float *in, int N)
Definition: new_half.cu:94

quda::norm
__host__ __device__ ValueType norm(const complex< ValueType > &z)
Returns the magnitude of z squared.
Definition: complex_quda.h:896

ucopy
void ucopy(float &a, const ushort &b)
Definition: new_half.cu:17

MAX_USHORT
#define MAX_USHORT
Definition: new_half.cu:16

host_free
#define host_free(ptr)
Definition: malloc_quda.h:59

quda::sqrt
__host__ __device__ ValueType sqrt(ValueType x)
Definition: complex_quda.h:105

tmp
cudaColorSpinorField * tmp
Definition: covdev_test.cpp:44

new_load_half
void new_load_half(float spinor[24], short *in, float *norm, int idx)
Definition: new_half.cu:43

quda::copy
__host__ __device__ void copy(T1 &a, const T2 &b)
Definition: register_traits.h:114

l2
double l2(float *a, float *b, int N)
Definition: new_half.cu:120

newCopyToFloat
void newCopyToFloat(float *out, short *in, float *norm, int N)
Definition: new_half.cu:103

quda
Definition: blas_cublas.h:6

insertNoise
void insertNoise(float *field, int N, float power)
Definition: new_half.cu:112

b
#define b
Definition: dw_dslash4_core.h:83

main
int main()
Definition: new_half.cu:135

printf
int printf(const char *,...) __attribute__((__format__(__printf__

quda::sin
__host__ __device__ ValueType sin(ValueType x)
Definition: complex_quda.h:40

in
cpuColorSpinorField * in
Definition: staggered_invert_test.cpp:44

quda::atan2
__host__ __device__ ValueType atan2(ValueType x, ValueType y)
Definition: complex_quda.h:65

fused_exterior_ndeg_tm_dslash_cuda_gen.i
int i
start here
Definition: fused_exterior_ndeg_tm_dslash_cuda_gen.py:816

quda::pow
__host__ __device__ ValueType pow(ValueType x, ExponentType e)
Definition: complex_quda.h:100

comm_drand
double comm_drand(void)
Definition: comm_common.cpp:82

register_traits.h
Provides precision abstractions and defines the register precision given the storage precision using ...

new_save_half
void new_save_half(float spinor[24], short *out, float *norm, int idx)
Definition: new_half.cu:57

safe_malloc
#define safe_malloc(size)
Definition: malloc_quda.h:54

oldCopyToFloat
void oldCopyToFloat(float *out, short *in, float *norm, int N)
Definition: new_half.cu:85

old_save_half
void old_save_half(float spinor[24], short *out, float *norm, int idx)
Definition: new_half.cu:30

out
cpuColorSpinorField * out
Definition: staggered_invert_test.cpp:45

ref
cpuColorSpinorField * ref
Definition: staggered_invert_test.cpp:46

idx
int idx
Definition: staggered_fused_exterior_dslash_core.h:355

fabs
double fabs(double)

old_load_half
void old_load_half(float spinor[24], short *in, float *norm, int idx)
Definition: new_half.cu:21

quda::cos
__host__ __device__ ValueType cos(ValueType x)
Definition: complex_quda.h:35

float
float
Definition: CMakeCUDACompilerId.cpp1.ii:12791

oldCopyToHalf
void oldCopyToHalf(short *out, float *norm, float *in, int N)
Definition: new_half.cu:76

a
#define a
Definition: dw_dslash4_core.h:82

spinor
cpuColorSpinorField * spinor
Definition: covdev_test.cpp:41

quda_internal.h