13 #ifdef GPU_CLOVER_DIRAC 20 #if (CUDA_VERSION < 8000) 21 #define MAX_NVECTOR 1 // multi-vector code doesn't seem to work well with CUDA 7.x 26 template<
typename Float,
typename Output,
typename InputA,
typename InputB>
27 struct CloverSigmaOprodArg {
29 InputA inA[MAX_NVECTOR];
30 InputB inB[MAX_NVECTOR];
31 Float
coeff[MAX_NVECTOR][2];
35 CloverSigmaOprodArg(Output &oprod, InputA *inA_, InputB *inB_,
36 const std::vector<std::vector<double> > &coeff_,
37 const GaugeField &meta,
int nvector)
38 : oprod(oprod),
length(meta.VolumeCB()), nvector(nvector)
40 for (
int i=0;
i<nvector;
i++) {
49 template <
typename real,
int nvector,
int mu,
int nu,
int parity,
typename Arg>
50 inline __device__
void sigmaOprod(Arg &
arg,
int idx) {
55 for (
int i=0;
i<nvector;
i++) {
56 ColorSpinor<real,3,4> A, B;
62 ColorSpinor<real,3,4> C = A.sigma(nu,
mu);
66 result -=
conj(result);
71 arg.oprod.save(reinterpret_cast<real*>(temp.data),
idx, (
mu-1)*
mu/2 + nu,
parity);
74 template<
int nvector,
typename real,
typename Output,
typename InputA,
typename InputB>
75 __global__
void sigmaOprodKernel(CloverSigmaOprodArg<real, Output, InputA, InputB>
arg) {
79 int mu_nu = blockIdx.z*
blockDim.z + threadIdx.z;
81 if (
idx >=
arg.length)
return;
82 if (mu_nu >= 6)
return;
87 case 0: sigmaOprod<real, nvector, 1, 0, 0>(
arg,
idx);
break;
88 case 1: sigmaOprod<real, nvector, 2, 0, 0>(
arg,
idx);
break;
89 case 2: sigmaOprod<real, nvector, 2, 1, 0>(
arg,
idx);
break;
90 case 3: sigmaOprod<real, nvector, 3, 0, 0>(
arg,
idx);
break;
91 case 4: sigmaOprod<real, nvector, 3, 1, 0>(
arg,
idx);
break;
92 case 5: sigmaOprod<real, nvector, 3, 2, 0>(
arg,
idx);
break;
97 case 0: sigmaOprod<real, nvector, 1, 0, 1>(
arg,
idx);
break;
98 case 1: sigmaOprod<real, nvector, 2, 0, 1>(
arg,
idx);
break;
99 case 2: sigmaOprod<real, nvector, 2, 1, 1>(
arg,
idx);
break;
100 case 3: sigmaOprod<real, nvector, 3, 0, 1>(
arg,
idx);
break;
101 case 4: sigmaOprod<real, nvector, 3, 1, 1>(
arg,
idx);
break;
102 case 5: sigmaOprod<real, nvector, 3, 2, 1>(
arg,
idx);
break;
111 template<
typename Float,
typename Output,
typename InputA,
typename InputB>
112 class CloverSigmaOprod :
public TunableVectorYZ {
115 CloverSigmaOprodArg<Float,Output,InputA,InputB> &
arg;
116 const GaugeField &meta;
118 unsigned int sharedBytesPerThread()
const {
return 0; }
119 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
121 unsigned int minThreads()
const {
return arg.length; }
122 bool tuneGridDim()
const {
return false; }
125 CloverSigmaOprod(CloverSigmaOprodArg<Float,Output,InputA,InputB> &
arg,
const GaugeField &meta)
126 : TunableVectorYZ(2,6),
arg(
arg), meta(meta) {
127 writeAuxString(
"prec=%lu,stride=%d,nvector=%d",
sizeof(Float),
arg.inA[0].Stride(),
arg.nvector);
131 virtual ~CloverSigmaOprod() {}
133 void apply(
const cudaStream_t &
stream){
136 switch(
arg.nvector) {
137 case 1: sigmaOprodKernel< 1><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
break;
138 case 2: sigmaOprodKernel< 2><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
break;
139 case 3: sigmaOprodKernel< 3><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
break;
140 case 4: sigmaOprodKernel< 4><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
break;
141 case 5: sigmaOprodKernel< 5><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
break;
142 case 6: sigmaOprodKernel< 6><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
break;
143 case 7: sigmaOprodKernel< 7><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
break;
144 case 8: sigmaOprodKernel< 8><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
break;
145 case 9: sigmaOprodKernel< 9><<<tp.grid,tp.block,tp.shared_bytes,
stream>>>(
arg);
break;
148 errorQuda(
"No CPU support for staggered outer-product calculation\n");
152 void preTune() { this->arg.oprod.save(); }
153 void postTune() { this->arg.oprod.load(); }
155 long long flops()
const {
156 return (2*(
long long)
arg.length)*6*((0 + 144 + 18)*
arg.nvector + 18);
158 long long bytes()
const {
159 return (2*(
long long)
arg.length)*6*((
arg.inA[0].Bytes() +
arg.inB[0].Bytes())*
arg.nvector + 2*
arg.oprod.Bytes());
162 TuneKey tuneKey()
const {
163 return TuneKey(meta.VolString(),
"CloverSigmaOprod", aux);
167 template<
typename Float,
typename Output,
typename InputA,
typename InputB>
169 std::vector<std::vector<double> > &
coeff,
int nvector) {
171 CloverSigmaOprodArg<Float,Output,InputA,InputB>
arg(oprod, inA, inB,
coeff,
out, nvector);
172 CloverSigmaOprod<Float,Output,InputA,InputB> sigma_oprod(
arg,
out);
173 sigma_oprod.apply(0);
176 #endif // GPU_CLOVER_FORCE 179 std::vector<ColorSpinorField*> &
x,
180 std::vector<ColorSpinorField*> &
p,
181 std::vector<std::vector<double> > &
coeff)
184 #ifdef GPU_CLOVER_DIRAC 185 if (
x.size() > MAX_NVECTOR) {
187 std::vector<ColorSpinorField*> x0(
x.begin(),
x.begin()+
x.size()/2);
188 std::vector<ColorSpinorField*> p0(
p.begin(),
p.begin()+
p.size()/2);
189 std::vector<std::vector<double> > coeff0(
coeff.begin(),
coeff.begin()+
coeff.size()/2);
190 for (
unsigned int i=0;
i<coeff0.size();
i++) {
191 coeff0[
i].reserve(2); coeff0[
i][0] =
coeff[
i][0]; coeff0[
i][1] =
coeff[
i][1];
195 std::vector<ColorSpinorField*> x1(
x.begin()+
x.size()/2,
x.end());
196 std::vector<ColorSpinorField*> p1(
p.begin()+
p.size()/2,
p.end());
197 std::vector<std::vector<double> > coeff1(
coeff.begin()+
coeff.size()/2,
coeff.end());
198 for (
unsigned int i=0;
i<coeff1.size();
i++) {
217 for (
unsigned int i=0;
i<
x.size();
i++) {
218 spinorA[
i].
set(*dynamic_cast<cudaColorSpinorField*>(
x[
i]));
219 spinorB[
i].
set(*dynamic_cast<cudaColorSpinorField*>(
p[
i]));
223 oprod, spinorA, spinorB,
coeff,
x.size());
228 #else // GPU_CLOVER_DIRAC not defined 229 errorQuda(
"Clover Dirac operator has not been built!");
void set(const cudaColorSpinorField &x)
QudaVerbosity getVerbosity()
std::complex< double > Complex
__device__ __host__ Matrix< complex< Float >, Nc > outerProdSpinTrace(const ColorSpinor< Float, Nc, Ns > &a, const ColorSpinor< Float, Nc, Ns > &b)
static __inline__ size_t p
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Main header file for host and device accessors to GaugeFields.
cpuColorSpinorField * out
__host__ __device__ ValueType arg(const complex< ValueType > &z)
Returns the phase angle of z.
QudaGaugeFieldOrder Order() const
__host__ __device__ ValueType conj(ValueType x)
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
void computeCloverSigmaOprod(GaugeField &oprod, std::vector< ColorSpinorField *> &x, std::vector< ColorSpinorField *> &p, std::vector< std::vector< double > > &coeff)
Compute the outer product from the solver solution fields arising from the diagonal term of the fermi...
QudaPrecision Precision() const