2 Spinor reordering and copying routines. These are implemented to
3 un on both CPU and GPU. Here we are templating on the following:
11 #include <color_spinor_field.h>
12 #include <color_spinor_field_order.h>
13 #include <tune_quda.h>
14 #include <utility> // for std::swap
15 #include <random_quda.h>
19 using namespace colorspinor;
21 template<typename real, int Ns, int Nc, QudaFieldOrder order>
23 typedef typename colorspinor::FieldOrderCB<real,Ns,Nc,1,order> V;
28 Arg(ColorSpinorField &v, RNG &rng) : v(v), nParity(v.SiteSubset()), volumeCB(v.VolumeCB()), rng(rng) { }
31 template<typename real, typename Arg> // Gauss
32 __device__ __host__ inline void genGauss(Arg &arg, cuRNGState& localState, int parity, int x_cb, int s, int c) {
33 real phi = 2.0*M_PI*Random<real>(localState);
34 real radius = Random<real>(localState);
35 radius = sqrt(-1.0 * log(radius));
36 arg.v(parity, x_cb, s, c) = complex<real>(radius*cos(phi),radius*sin(phi));
39 template<typename real, typename Arg> // Uniform
40 __device__ __host__ inline void genUniform(Arg &arg, cuRNGState& localState, int parity, int x_cb, int s, int c) {
41 real x = Random<real>(localState);
42 real y = Random<real>(localState);
43 arg.v(parity, x_cb, s, c) = complex<real>(x, y);
46 /** CPU function to reorder spinor fields. */
47 template <typename real, int Ns, int Nc, QudaNoiseType type, typename Arg> void SpinorNoiseCPU(Arg &arg)
50 for (int parity = 0; parity < arg.nParity; parity++) {
51 for (int x_cb = 0; x_cb < arg.volumeCB; x_cb++) {
52 for (int s = 0; s < Ns; s++) {
53 for (int c = 0; c < Nc; c++) {
54 cuRNGState localState = arg.rng.State()[parity * arg.volumeCB + x_cb];
55 if (type == QUDA_NOISE_GAUSS)
56 genGauss<real>(arg, localState, parity, x_cb, s, c);
57 else if (type == QUDA_NOISE_UNIFORM)
58 genUniform<real>(arg, localState, parity, x_cb, s, c);
59 arg.rng.State()[parity * arg.volumeCB + x_cb] = localState;
66 /** CUDA kernel to reorder spinor fields. Adopts a similar form as the CPU version, using the same inlined functions. */
67 template <typename real, int Ns, int Nc, QudaNoiseType type, typename Arg>
68 __global__ void SpinorNoiseGPU(Arg arg) {
70 int x_cb = blockIdx.x * blockDim.x + threadIdx.x;
71 if (x_cb >= arg.volumeCB) return;
73 int parity = blockIdx.y * blockDim.y + threadIdx.y;
74 if (parity >= arg.nParity) return;
76 cuRNGState localState = arg.rng.State()[parity * arg.volumeCB + x_cb];
77 for (int s=0; s<Ns; s++) {
78 for (int c=0; c<Nc; c++) {
79 if (type == QUDA_NOISE_GAUSS) genGauss<real>(arg, localState, parity, x_cb, s, c);
80 else if (type == QUDA_NOISE_UNIFORM) genUniform<real>(arg, localState, parity, x_cb, s, c);
83 arg.rng.State()[parity * arg.volumeCB + x_cb] = localState;
86 template <typename real, int Ns, int Nc, QudaNoiseType type, typename Arg>
87 class SpinorNoise : TunableVectorY {
89 const ColorSpinorField &meta; // this reference is for meta data only
92 unsigned int sharedBytesPerThread() const { return 0; }
93 unsigned int sharedBytesPerBlock(const TuneParam ¶m) const { return 0; }
94 bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
95 unsigned int minThreads() const { return meta.VolumeCB(); }
98 SpinorNoise(Arg &arg, const ColorSpinorField &meta)
99 : TunableVectorY(meta.SiteSubset()), arg(arg), meta(meta) {
100 strcpy(aux, meta.AuxString());
101 strcat(aux, meta.Location()==QUDA_CUDA_FIELD_LOCATION ? ",GPU" : ",CPU");
104 void apply(const qudaStream_t &stream) {
105 TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
106 qudaLaunchKernel(SpinorNoiseGPU<real, Ns, Nc, type, Arg>, tp, stream, arg);
109 bool advanceTuneParam(TuneParam ¶m) const {
110 if (meta.Location() == QUDA_CUDA_FIELD_LOCATION) return Tunable::advanceTuneParam(param);
114 TuneKey tuneKey() const { return TuneKey(meta.VolString(), typeid(*this).name(), aux); }
115 long long flops() const { return 0; }
116 long long bytes() const { return meta.Bytes(); }
117 void preTune() { arg.rng.backup(); }
118 void postTune(){ arg.rng.restore(); }
121 template <typename real, int Ns, int Nc, QudaFieldOrder order>
122 void spinorNoise(ColorSpinorField &in, RNG &rngstate, QudaNoiseType type) {
123 Arg<real, Ns, Nc, order> arg(in, rngstate);
125 case QUDA_NOISE_GAUSS:
127 SpinorNoise<real, Ns, Nc, QUDA_NOISE_GAUSS, Arg<real, Ns, Nc, order> > noise(arg, in);
131 case QUDA_NOISE_UNIFORM:
133 SpinorNoise<real, Ns, Nc, QUDA_NOISE_UNIFORM, Arg<real, Ns, Nc, order> > noise(arg, in);
138 errorQuda("Noise type %d not implemented", type);
142 /** Decide on the input order*/
143 template <typename real, int Ns, int Nc>
144 void spinorNoise(ColorSpinorField &in, RNG &rngstate, QudaNoiseType type)
146 if (in.FieldOrder() == QUDA_FLOAT2_FIELD_ORDER) {
147 spinorNoise<real,Ns,Nc,QUDA_FLOAT2_FIELD_ORDER>(in, rngstate, type);
148 } else if (in.FieldOrder() == QUDA_FLOAT4_FIELD_ORDER) {
149 spinorNoise<real,Ns,Nc,QUDA_FLOAT4_FIELD_ORDER>(in, rngstate, type);
151 errorQuda("Order %d not defined (Ns=%d, Nc=%d)", in.FieldOrder(), Ns, Nc);
155 template <typename real, int Ns>
156 void spinorNoise(ColorSpinorField &src, RNG& randstates, QudaNoiseType type)
158 if (src.Ncolor() == 3) {
159 spinorNoise<real,Ns,3>(src, randstates, type);
160 } else if (src.Ncolor() == 6) {
161 spinorNoise<real,Ns,6>(src, randstates, type);
162 } else if (src.Ncolor() == 24) {
163 spinorNoise<real,Ns,24>(src, randstates, type);
164 } else if (src.Ncolor() == 32) {
165 spinorNoise<real,Ns,32>(src, randstates, type);
166 } else if (src.Ncolor() == 64) {
167 spinorNoise<real,Ns,64>(src, randstates, type);
168 } else if (src.Ncolor() == 96) {
169 spinorNoise<real,Ns,96>(src, randstates, type);
171 errorQuda("nColor = %d not implemented", src.Ncolor());
175 template <typename real>
176 void spinorNoise(ColorSpinorField &src, RNG& randstates, QudaNoiseType type)
178 if (src.Nspin() == 4) {
180 spinorNoise<real,4>(src, randstates, type);
182 errorQuda("spinorNoise has not been built for nSpin=%d fields", src.Nspin());
184 } else if (src.Nspin() == 2) {
186 spinorNoise<real,2>(src, randstates, type);
188 errorQuda("spinorNoise has not been built for nSpin=%d fields", src.Nspin());
190 } else if (src.Nspin() == 1) {
192 spinorNoise<real,1>(src, randstates, type);
194 errorQuda("spinorNoise has not been built for nSpin=%d fields", src.Nspin());
197 errorQuda("Nspin = %d not implemented", src.Nspin());
201 void spinorNoise(ColorSpinorField &src_, RNG &randstates, QudaNoiseType type)
203 // if src is a CPU field then create GPU field
204 ColorSpinorField *src = &src_;
205 if (src_.Location() == QUDA_CPU_FIELD_LOCATION || src_.Precision() < QUDA_SINGLE_PRECISION) {
206 ColorSpinorParam param(src_);
207 QudaPrecision prec = std::max(src_.Precision(), QUDA_SINGLE_PRECISION);
208 param.setPrecision(prec, prec, true); // change to native field order
209 param.create = QUDA_NULL_FIELD_CREATE;
210 param.location = QUDA_CUDA_FIELD_LOCATION;
211 src = ColorSpinorField::Create(param);
214 switch (src->Precision()) {
215 case QUDA_DOUBLE_PRECISION: spinorNoise<double>(*src, randstates, type); break;
216 case QUDA_SINGLE_PRECISION: spinorNoise<float>(*src, randstates, type); break;
217 default: errorQuda("Precision %d not implemented", src->Precision());
221 src_ = *src; // upload result
226 void spinorNoise(ColorSpinorField &src, unsigned long long seed, QudaNoiseType type)
228 RNG *randstates = new RNG(src, seed);
230 spinorNoise(src, *randstates, type);
231 randstates->Release();