19 #define COMPILE_HISQ_DP_18
20 #define COMPILE_HISQ_DP_12
21 #define COMPILE_HISQ_SP_18
22 #define COMPILE_HISQ_SP_12
25 #define HISQ_SITE_MATRIX_LOAD_TEX 1
26 #define HISQ_NEW_OPROD_LOAD_TEX 1
28 #ifdef USE_TEXTURE_OBJECTS
29 #define TEX1DFETCH(type, tex, idx) tex1Dfetch<type>((tex), idx)
31 #define TEX1DFETCH(type, tex, idx) tex1Dfetch((tex), idx)
35 #if (__COMPUTE_CAPABILITY__ >= 130)
37 template<
typename Tex>
38 static __inline__ __device__
double fetch_double(Tex t,
int i)
41 return __hiloint2double(v.y, v.x);
44 template <
typename Tex>
45 static __inline__ __device__ double2
fetch_double2(Tex t,
int i)
48 return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
51 static __inline__ __device__ double2 fetch_double2_old(texture<int4, 1> t,
int i)
53 int4 v = tex1Dfetch(t,i);
54 return make_double2(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z));
57 #endif //__COMPUTE_CAPABILITY__ >= 130
64 namespace fermion_force {
66 struct hisq_kernel_param_t{
73 int color_matrix_stride;
78 int half_volume = param.
X[0]*param.
X[1]*param.
X[2]*param.
X[3]/2;
80 int extended_half_volume = (param.
X[0]+4)*(param.
X[1]+4)*(param.
X[2]+4)*(param.
X[3]+4)/2;
81 thin_link_stride = extended_half_volume + param.
site_ga_pad;
82 color_matrix_stride = extended_half_volume;
85 color_matrix_stride = half_volume;
87 momentum_stride = half_volume + param.
mom_ga_pad;
93 texture<int4, 1> thinLink0TexDouble;
94 texture<int4, 1> thinLink1TexDouble;
97 texture<float2, 1, cudaReadModeElementType> thinLink0TexSingle;
98 texture<float2, 1, cudaReadModeElementType> thinLink1TexSingle;
100 texture<float4, 1, cudaReadModeElementType> thinLink0TexSingle_recon;
101 texture<float4, 1, cudaReadModeElementType> thinLink1TexSingle_recon;
104 texture<int4, 1> newOprod0TexDouble;
105 texture<int4, 1> newOprod1TexDouble;
106 texture<float2, 1, cudaReadModeElementType> newOprod0TexSingle;
107 texture<float2, 1, cudaReadModeElementType> newOprod1TexSingle;
111 inline __device__ __host__
int linkIndex(
int x[],
int dx[],
const int X[4]) {
113 for (
int i=0; i<4; i++) y[i] = (x[i] + dx[i] + X[i]) % X[i];
114 int idx = (((y[3]*X[2] + y[2])*X[1] + y[1])*X[0] + y[0]) >> 1;
118 inline __device__ __host__
void updateCoords(
int x[],
int dir,
int shift,
const int X[4],
const int partitioned){
121 x[dir] = (partitioned || (x[dir] != X[dir]+1)) ? x[dir]+1 : 2;
122 }
else if(shift == -1){
123 x[dir] = (partitioned || (x[dir] != 2)) ? x[dir]-1 : X[dir]+1;
126 x[dir] = (x[dir]+shift + X[dir])%X[dir];
132 __device__ __host__
inline void getCoords(
int x[4],
int cb_index,
const int X[4],
int parity)
134 x[3] = cb_index/(X[2]*X[1]*X[0]/2);
135 x[2] = (cb_index/(X[1]*X[0]/2)) % X[2];
136 x[1] = (cb_index/(X[0]/2)) % X[1];
137 x[0] = 2*(cb_index%(X[0]/2)) + ((x[3]+x[2]+x[1]+
parity)&1);
143 __device__ __host__
inline int posDir(
int dir){
144 return (dir >= 4) ? 7-dir : dir;
161 inline __device__ float2
operator*(
float a,
const float2 & b)
163 return make_float2(a*b.x,a*b.y);
166 inline __device__ double2
operator*(
double a,
const double2 & b)
168 return make_double2(a*b.x,a*b.y);
171 inline __device__
const float2 &
operator+=(float2 & a,
const float2 & b)
178 inline __device__
const double2 &
operator+=(double2 & a,
const double2 & b)
185 inline __device__
const float4 &
operator+=(float4 & a,
const float4 & b)
202 struct RealTypeId<float2>
208 struct RealTypeId<double2>
216 void adjointMatrix(T*
mat)
218 #define CONJ_INDEX(i,j) j*3 + i
221 mat[CONJ_INDEX(0,0)] =
Conj(mat[0]);
222 mat[CONJ_INDEX(1,1)] =
Conj(mat[4]);
223 mat[CONJ_INDEX(2,2)] =
Conj(mat[8]);
225 mat[CONJ_INDEX(1,0)] =
Conj(mat[3]);
226 mat[CONJ_INDEX(0,1)] =
tmp;
228 mat[CONJ_INDEX(2,0)] =
Conj(mat[6]);
229 mat[CONJ_INDEX(0,2)] =
tmp;
231 mat[CONJ_INDEX(2,1)] =
Conj(mat[7]);
232 mat[CONJ_INDEX(1,2)] =
tmp;
239 template<
int N,
class T>
242 int dir,
int idx, T*
const mat,
int oddness,
int stride)
244 const T*
const field = (oddness)?field_odd:field_even;
245 for(
int i = 0;i < N ;i++){
246 mat[i] = field[idx + dir*N*stride + i*stride];
254 int dir,
int idx, T*
const mat,
int oddness,
int stride)
256 loadMatrixFromField<9> (field_even, field_odd, dir,
idx,
mat, oddness, stride);
264 int dir,
int idx, float2*
const mat,
int oddness,
int stride)
266 const float4*
const field = oddness?field_odd: field_even;
268 tmp = field[idx + dir*stride*3];
269 mat[0] = make_float2(tmp.x, tmp.y);
270 mat[1] = make_float2(tmp.z, tmp.w);
271 tmp = field[idx + dir*stride*3 + stride];
272 mat[2] = make_float2(tmp.x, tmp.y);
273 mat[3] = make_float2(tmp.z, tmp.w);
274 tmp = field[idx + dir*stride*3 + 2*stride];
275 mat[4] = make_float2(tmp.x, tmp.y);
276 mat[5] = make_float2(tmp.z, tmp.w);
282 void loadMatrixFromField(
const T*
const field_even,
const T*
const field_odd,
int idx, T*
const mat,
int oddness,
int stride)
284 const T*
const field = (oddness)?field_odd:field_even;
286 mat[1] = field[idx + stride];
287 mat[2] = field[idx + stride*2];
288 mat[3] = field[idx + stride*3];
289 mat[4] = field[idx + stride*4];
290 mat[5] = field[idx + stride*5];
291 mat[6] = field[idx + stride*6];
292 mat[7] = field[idx + stride*7];
293 mat[8] = field[idx + stride*8];
301 double2*
const field_even, double2*
const field_odd,
int oddness,
int stride){
302 double2*
const field = (oddness)?field_odd: field_even;
305 #if (HISQ_NEW_OPROD_LOAD_TEX == 1)
306 value[0] =
READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9);
307 value[1] =
READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + stride);
308 value[2] =
READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 2*stride);
309 value[3] =
READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 3*stride);
310 value[4] =
READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 4*stride);
311 value[5] =
READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 5*stride);
312 value[6] =
READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 6*stride);
313 value[7] =
READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 7*stride);
314 value[8] =
READ_DOUBLE2_TEXTURE( ((oddness)?newOprod1TexDouble:newOprod0TexDouble), field, idx+dir*stride*9 + 8*stride);
316 for(
int i=0; i<9; ++i) value[i] = field[i];
319 field[idx + dir*stride*9] = value[0] + coeff*mat[0];
320 field[idx + dir*stride*9 + stride] = value[1] + coeff*mat[1];
321 field[idx + dir*stride*9 + stride*2] = value[2] + coeff*mat[2];
322 field[idx + dir*stride*9 + stride*3] = value[3] + coeff*mat[3];
323 field[idx + dir*stride*9 + stride*4] = value[4] + coeff*mat[4];
324 field[idx + dir*stride*9 + stride*5] = value[5] + coeff*mat[5];
325 field[idx + dir*stride*9 + stride*6] = value[6] + coeff*mat[6];
326 field[idx + dir*stride*9 + stride*7] = value[7] + coeff*mat[7];
327 field[idx + dir*stride*9 + stride*8] = value[8] + coeff*mat[8];
336 float2*
const field_even, float2*
const field_odd,
int oddness,
int stride){
337 float2*
const field = (oddness)?field_odd: field_even;
340 #if (HISQ_NEW_OPROD_LOAD_TEX == 1)
341 value[0] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9);
342 value[1] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + stride);
343 value[2] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 2*stride);
344 value[3] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 3*stride);
345 value[4] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 4*stride);
346 value[5] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 5*stride);
347 value[6] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 6*stride);
348 value[7] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 7*stride);
349 value[8] = tex1Dfetch( ((oddness)?newOprod1TexSingle:newOprod0TexSingle), idx+dir*stride*9 + 8*stride);
351 for(
int i=0; i<9; ++i) value[i] = field[i];
353 field[idx + dir*stride*9] = value[0] + coeff*mat[0];
354 field[idx + dir*stride*9 + stride] = value[1] + coeff*mat[1];
355 field[idx + dir*stride*9 + stride*2] = value[2] + coeff*mat[2];
356 field[idx + dir*stride*9 + stride*3] = value[3] + coeff*mat[3];
357 field[idx + dir*stride*9 + stride*4] = value[4] + coeff*mat[4];
358 field[idx + dir*stride*9 + stride*5] = value[5] + coeff*mat[5];
359 field[idx + dir*stride*9 + stride*6] = value[6] + coeff*mat[6];
360 field[idx + dir*stride*9 + stride*7] = value[7] + coeff*mat[7];
361 field[idx + dir*stride*9 + stride*8] = value[8] + coeff*mat[8];
369 template<
class T,
class U>
372 T*
const field_even, T*
const field_odd,
int oddness,
int stride)
374 T*
const field = (oddness)?field_odd: field_even;
375 field[idx + dir*stride*9] += coeff*mat[0];
376 field[idx + dir*stride*9 + stride] += coeff*mat[1];
377 field[idx + dir*stride*9 + stride*2] += coeff*mat[2];
378 field[idx + dir*stride*9 + stride*3] += coeff*mat[3];
379 field[idx + dir*stride*9 + stride*4] += coeff*mat[4];
380 field[idx + dir*stride*9 + stride*5] += coeff*mat[5];
381 field[idx + dir*stride*9 + stride*6] += coeff*mat[6];
382 field[idx + dir*stride*9 + stride*7] += coeff*mat[7];
383 field[idx + dir*stride*9 + stride*8] += coeff*mat[8];
389 template<
class T,
class U>
391 void addMatrixToField(
const T*
const mat,
int idx, U coeff, T*
const field_even,
392 T*
const field_odd,
int oddness,
int stride)
394 T*
const field = (oddness)?field_odd: field_even;
395 field[
idx ] += coeff*mat[0];
396 field[idx + stride] += coeff*mat[1];
397 field[idx + stride*2] += coeff*mat[2];
398 field[idx + stride*3] += coeff*mat[3];
399 field[idx + stride*4] += coeff*mat[4];
400 field[idx + stride*5] += coeff*mat[5];
401 field[idx + stride*6] += coeff*mat[6];
402 field[idx + stride*7] += coeff*mat[7];
403 field[idx + stride*8] += coeff*mat[8];
408 template<
class T,
class U>
410 void addMatrixToField_test(
const T*
const mat,
int idx, U coeff, T*
const field_even,
411 T*
const field_odd,
int oddness,
int stride)
413 T*
const field = (oddness)?field_odd: field_even;
415 field[
idx ] += coeff*mat[0];
416 field[idx + stride] += coeff*mat[1];
417 field[idx + stride*2] += coeff*mat[2];
418 field[idx + stride*3] += coeff*mat[3];
419 field[idx + stride*4] += coeff*mat[4];
420 field[idx + stride*5] += coeff*mat[5];
421 field[idx + stride*6] += coeff*mat[6];
422 field[idx + stride*7] += coeff*mat[7];
423 field[idx + stride*8] += coeff*mat[8];
425 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
426 printf(
"value is coeff(%f) * mat[0].x(%f)=%f\n", coeff, mat[0].x, field[idx].x);
433 void storeMatrixToField(
const T*
const mat,
int dir,
int idx, T*
const field_even, T*
const field_odd,
int oddness,
int stride)
435 T*
const field = (oddness)?field_odd: field_even;
436 field[idx + dir*stride*9] = mat[0];
437 field[idx + dir*stride*9 + stride] = mat[1];
438 field[idx + dir*stride*9 + stride*2] = mat[2];
439 field[idx + dir*stride*9 + stride*3] = mat[3];
440 field[idx + dir*stride*9 + stride*4] = mat[4];
441 field[idx + dir*stride*9 + stride*5] = mat[5];
442 field[idx + dir*stride*9 + stride*6] = mat[6];
443 field[idx + dir*stride*9 + stride*7] = mat[7];
444 field[idx + dir*stride*9 + stride*8] = mat[8];
452 void storeMatrixToField(
const T*
const mat,
int idx, T*
const field_even, T*
const field_odd,
int oddness,
int stride)
454 T*
const field = (oddness)?field_odd: field_even;
456 field[idx + stride] = mat[1];
457 field[idx + stride*2] = mat[2];
458 field[idx + stride*3] = mat[3];
459 field[idx + stride*4] = mat[4];
460 field[idx + stride*5] = mat[5];
461 field[idx + stride*6] = mat[6];
462 field[idx + stride*7] = mat[7];
463 field[idx + stride*8] = mat[8];
469 template<
class T,
class U>
471 void storeMatrixToMomentumField(
const T*
const mat,
int dir,
int idx, U coeff,
472 T*
const mom_even, T*
const mom_odd,
int oddness,
int stride)
474 T*
const mom_field = (oddness)?mom_odd:mom_even;
476 temp2.x = (mat[1].x - mat[3].x)*0.5*coeff;
477 temp2.y = (mat[1].y + mat[3].y)*0.5*coeff;
478 mom_field[idx + dir*stride*5] = temp2;
480 temp2.x = (mat[2].x - mat[6].x)*0.5*coeff;
481 temp2.y = (mat[2].y + mat[6].y)*0.5*coeff;
482 mom_field[idx + dir*stride*5 + stride] = temp2;
484 temp2.x = (mat[5].x - mat[7].x)*0.5*coeff;
485 temp2.y = (mat[5].y + mat[7].y)*0.5*coeff;
486 mom_field[idx + dir*stride*5 + stride*2] = temp2;
488 const typename RealTypeId<T>::Type temp = (mat[0].y + mat[4].y + mat[8].y)*0.3333333333333333333333333;
489 temp2.x = (mat[0].y-temp)*coeff;
490 temp2.y = (mat[4].y-temp)*coeff;
491 mom_field[idx + dir*stride*5 + stride*3] = temp2;
493 temp2.x = (mat[8].y - temp)*coeff;
495 mom_field[idx + dir*stride*5 + stride*4] = temp2;
501 template<
int pos_dir,
int odd_lattice>
504 static const int result = -1;
508 struct CoeffSign<0,1>
510 static const int result = -1;
514 struct CoeffSign<0,0>
516 static const int result = 1;
520 struct CoeffSign<1,1>
522 static const int result = 1;
525 template<
int odd_lattice>
528 static const int result = 1;
534 static const int result = -1;
537 template<
class RealX>
540 static const int result=9;
544 struct ArrayLength<float4>
546 static const int result=5;
553 template<
class RealA,
int oddBit>
555 do_one_link_term_kernel(
const RealA*
const oprodEven,
const RealA*
const oprodOdd,
556 typename RealTypeId<RealA>::Type coeff,
559 int sid = blockIdx.x * blockDim.x + threadIdx.x;
560 if (sid >= kparam.threads)
return;
562 int dx[4] = {0,0,0,0};
565 int E[4] = {kparam.X[0]+4, kparam.X[1]+4, kparam.X[2]+4, kparam.X[3]+4};
566 for(
int dir=0; dir<4; ++dir) x[dir] += 2;
572 RealA COLOR_MAT_W[ArrayLength<RealA>::result];
581 __device__
void loadLink(
const double2*
const linkEven,
const double2*
const linkOdd,
int dir,
int idx, double2*
const var,
int oddness,
int stride){
582 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
590 void loadLink<12>(
const double2*
const linkEven,
const double2*
const linkOdd,
int dir,
int idx, double2*
const var,
int oddness,
int stride){
591 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
599 __device__
void loadLink(
const float4*
const linkEven,
const float4*
const linkOdd,
int dir,
int idx, float2*
const var,
int oddness,
int stride){
600 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
608 __device__
void loadLink(
const float2*
const linkEven,
const float2*
const linkOdd,
int dir,
int idx, float2*
const var ,
int oddness,
int stride){
609 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
618 #define DD_CONCAT(n,r) n ## r ## kernel
620 #define HISQ_KERNEL_NAME(a,b) DD_CONCAT(a,b)
655 template<
class RealA,
class RealB>
656 class MiddleLink :
public Tunable {
659 const cudaGaugeField &link;
660 const cudaGaugeField &oprod;
661 const cudaGaugeField &Qprev;
664 const typename RealTypeId<RealA>::Type &
coeff;
668 cudaGaugeField &newOprod;
669 const hisq_kernel_param_t &
kparam;
671 unsigned int sharedBytesPerThread()
const {
return 0; }
672 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
676 bool tuneGridDim()
const {
return false; }
677 unsigned int minThreads()
const {
return kparam.threads; }
680 MiddleLink(
const cudaGaugeField &link,
681 const cudaGaugeField &oprod,
682 const cudaGaugeField &Qprev,
684 const typename RealTypeId<RealA>::Type &coeff,
688 cudaGaugeField &newOprod,
689 const hisq_kernel_param_t &kparam) :
690 link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
691 coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
694 MiddleLink(
const cudaGaugeField &link,
695 const cudaGaugeField &oprod,
697 const typename RealTypeId<RealA>::Type &coeff,
701 cudaGaugeField &newOprod,
702 const hisq_kernel_param_t &kparam) :
703 link(link), oprod(oprod), Qprev(link), sig(sig), mu(mu),
704 coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
706 virtual ~MiddleLink() { ; }
708 TuneKey tuneKey()
const {
709 std::stringstream vol, aux;
710 vol << kparam.D[0] <<
"x";
711 vol << kparam.D[1] <<
"x";
712 vol << kparam.D[2] <<
"x";
714 aux <<
"threads=" << kparam.threads <<
",prec=" << link.Precision();
715 aux <<
",recon=" << link.Reconstruct() <<
",sig=" <<
sig <<
",mu=" <<
mu;
716 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
720 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
721 ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
722 (typeA*)Qprev_even, (typeA*)Qprev_odd, \
723 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
725 (typeA*)Pmu.Even_p(), (typeA*)Pmu.Odd_p(), \
726 (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
727 (typeA*)Qmu.Even_p(), (typeA*)Qmu.Odd_p(), \
728 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
731 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
732 if(oddness_change ==0 ){ \
733 if(sizeof(RealA) == sizeof(float2)){ \
734 if(recon == QUDA_RECONSTRUCT_NO){ \
735 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
736 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
738 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
739 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
742 if(recon == QUDA_RECONSTRUCT_NO){ \
743 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
744 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
746 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
747 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
751 if(sizeof(RealA) == sizeof(float2)){ \
752 if(recon == QUDA_RECONSTRUCT_NO){ \
753 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
754 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
756 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
757 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
760 if(recon == QUDA_RECONSTRUCT_NO){ \
761 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
762 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
764 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
765 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
772 void apply(
const cudaStream_t &
stream) {
775 int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
776 + kparam.base_idx[2] + kparam.base_idx[3])&1;
778 const void *Qprev_even = (&Qprev == &link) ? NULL : Qprev.Even_p();
779 const void *Qprev_odd = (&Qprev == &link) ? NULL : Qprev.Odd_p();
783 CALL_MIDDLE_LINK_KERNEL(1,1);
785 CALL_MIDDLE_LINK_KERNEL(1,0);
787 CALL_MIDDLE_LINK_KERNEL(0,1);
789 CALL_MIDDLE_LINK_KERNEL(0,0);
793 #undef CALL_ARGUMENTS
794 #undef CALL_MIDDLE_LINK_KERNEL
810 long long flops()
const {
return 0; }
814 template<
class RealA,
class RealB>
815 class LepageMiddleLink :
public Tunable {
818 const cudaGaugeField &link;
819 const cudaGaugeField &oprod;
820 const cudaGaugeField &Qprev;
823 const typename RealTypeId<RealA>::Type &
coeff;
825 cudaGaugeField &newOprod;
826 const hisq_kernel_param_t &
kparam;
828 unsigned int sharedBytesPerThread()
const {
return 0; }
829 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
832 bool tuneGridDim()
const {
return false; }
833 unsigned int minThreads()
const {
return kparam.threads; }
836 LepageMiddleLink(
const cudaGaugeField &link,
837 const cudaGaugeField &oprod,
838 const cudaGaugeField &Qprev,
840 const typename RealTypeId<RealA>::Type &coeff,
841 cudaGaugeField &
P3, cudaGaugeField &newOprod,
842 const hisq_kernel_param_t &kparam) :
843 link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
844 coeff(coeff), P3(P3), newOprod(newOprod), kparam(kparam)
846 virtual ~LepageMiddleLink() { ; }
848 TuneKey tuneKey()
const {
849 std::stringstream vol, aux;
850 vol << kparam.D[0] <<
"x";
851 vol << kparam.D[1] <<
"x";
852 vol << kparam.D[2] <<
"x";
854 aux <<
"threads=" << kparam.threads <<
",prec=" << link.Precision();
855 aux <<
",recon=" << link.Reconstruct() <<
",sig=" <<
sig <<
",mu=" <<
mu;
856 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
859 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
860 ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
861 (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(), \
862 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
864 (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
865 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), \
868 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
869 if(oddness_change == 0){ \
870 if(sizeof(RealA) == sizeof(float2)){ \
871 if(recon == QUDA_RECONSTRUCT_NO){ \
872 do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
873 do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
875 do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
876 do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
879 if(recon == QUDA_RECONSTRUCT_NO){ \
880 do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
881 do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
883 do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
884 do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
888 if(sizeof(RealA) == sizeof(float2)){ \
889 if(recon == QUDA_RECONSTRUCT_NO){ \
890 do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
891 do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
893 do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
894 do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
897 if(recon == QUDA_RECONSTRUCT_NO){ \
898 do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
899 do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
901 do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
902 do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
907 void apply(
const cudaStream_t &stream) {
910 int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
911 + kparam.base_idx[2] + kparam.base_idx[3])&1;
914 CALL_MIDDLE_LINK_KERNEL(1,1);
916 CALL_MIDDLE_LINK_KERNEL(1,0);
918 CALL_MIDDLE_LINK_KERNEL(0,1);
920 CALL_MIDDLE_LINK_KERNEL(0,0);
925 #undef CALL_ARGUMENTS
926 #undef CALL_MIDDLE_LINK_KERNEL
938 long long flops()
const {
939 if(
GOES_FORWARDS(
sig))
return 810*kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3];
940 return kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]*396;
944 template<
class RealA,
class RealB>
945 class SideLink :
public Tunable {
948 const cudaGaugeField &link;
949 const cudaGaugeField &
P3;
950 const cudaGaugeField &oprod;
953 const typename RealTypeId<RealA>::Type &
coeff;
955 cudaGaugeField &shortP;
956 cudaGaugeField &newOprod;
957 const hisq_kernel_param_t &
kparam;
959 unsigned int sharedBytesPerThread()
const {
return 0; }
960 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
963 bool tuneGridDim()
const {
return false; }
964 unsigned int minThreads()
const {
return kparam.threads; }
967 SideLink(
const cudaGaugeField &link,
968 const cudaGaugeField &
P3,
969 const cudaGaugeField &oprod,
971 const typename RealTypeId<RealA>::Type &coeff,
973 cudaGaugeField &shortP,
974 cudaGaugeField &newOprod,
975 const hisq_kernel_param_t &kparam) :
976 link(link), P3(P3), oprod(oprod),
977 sig(sig), mu(mu), coeff(coeff), accumu_coeff(accumu_coeff),
978 shortP(shortP), newOprod(newOprod), kparam(kparam)
980 virtual ~SideLink() { ; }
982 TuneKey tuneKey()
const {
983 std::stringstream vol, aux;
984 vol << kparam.D[0] <<
"x";
985 vol << kparam.D[1] <<
"x";
986 vol << kparam.D[2] <<
"x";
988 aux <<
"threads=" << kparam.threads <<
",prec=" << link.Precision();
989 aux <<
",recon=" << link.Reconstruct() <<
",sig=" <<
sig <<
",mu=" <<
mu;
990 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
993 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
994 ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
995 (typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
996 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
999 (typename RealTypeId<typeA>::Type) accumu_coeff, \
1000 (typeA*)shortP.Even_p(), (typeA*)shortP.Odd_p(), \
1001 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), \
1004 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
1005 if(oddness_change == 0){ \
1006 if(sizeof(RealA) == sizeof(float2)){ \
1007 if(recon == QUDA_RECONSTRUCT_NO){ \
1008 do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1009 do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1011 do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1012 do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1015 if(recon == QUDA_RECONSTRUCT_NO){ \
1016 do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1017 do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1019 do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1020 do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1024 if(sizeof(RealA) == sizeof(float2)){ \
1025 if(recon == QUDA_RECONSTRUCT_NO){ \
1026 do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1027 do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1029 do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1030 do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1033 if(recon == QUDA_RECONSTRUCT_NO){ \
1034 do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1035 do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1037 do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1038 do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1043 void apply(
const cudaStream_t &stream) {
1046 int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
1047 + kparam.base_idx[2] + kparam.base_idx[3])&1;
1050 CALL_SIDE_LINK_KERNEL(1,1);
1052 CALL_SIDE_LINK_KERNEL(1,0);
1054 CALL_SIDE_LINK_KERNEL(0,1);
1056 CALL_SIDE_LINK_KERNEL(0,0);
1060 #undef CALL_SIDE_LINK_KERNEL
1061 #undef CALL_ARGUMENTS
1073 long long flops()
const {
return 0; }
1077 template<
class RealA,
class RealB>
1078 class SideLinkShort :
public Tunable {
1081 const cudaGaugeField &link;
1082 const cudaGaugeField &
P3;
1085 const typename RealTypeId<RealA>::Type &
coeff;
1086 cudaGaugeField &newOprod;
1087 const hisq_kernel_param_t &
kparam;
1089 unsigned int sharedBytesPerThread()
const {
return 0; }
1090 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1093 bool tuneGridDim()
const {
return false; }
1094 unsigned int minThreads()
const {
return kparam.threads; }
1097 SideLinkShort(
const cudaGaugeField &link,
const cudaGaugeField &
P3,
int sig,
int mu,
1098 const typename RealTypeId<RealA>::Type &coeff, cudaGaugeField &newOprod,
1099 const hisq_kernel_param_t &kparam) :
1100 link(link), P3(P3), sig(sig), mu(mu), coeff(coeff), newOprod(newOprod), kparam(kparam)
1102 virtual ~SideLinkShort() { ; }
1104 TuneKey tuneKey()
const {
1105 std::stringstream vol, aux;
1106 vol << kparam.D[0] <<
"x";
1107 vol << kparam.D[1] <<
"x";
1108 vol << kparam.D[2] <<
"x";
1110 aux <<
"threads=" << kparam.threads <<
",prec=" << link.Precision();
1111 aux <<
",recon=" << link.Reconstruct() <<
",sig=" <<
sig <<
",mu=" <<
mu;
1112 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
1115 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1116 ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
1117 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1118 sig, mu, (typename RealTypeId<typeA>::Type) coeff, \
1119 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
1122 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
1123 if(oddness_change == 0){ \
1124 if(sizeof(RealA) == sizeof(float2)){ \
1125 if(recon == QUDA_RECONSTRUCT_NO){ \
1126 do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1127 do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1129 do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1130 do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1133 if(recon == QUDA_RECONSTRUCT_NO){ \
1134 do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1135 do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1137 do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1138 do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1142 if(sizeof(RealA) == sizeof(float2)){ \
1143 if(recon == QUDA_RECONSTRUCT_NO){ \
1144 do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1145 do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1147 do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1148 do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1151 if(recon == QUDA_RECONSTRUCT_NO){ \
1152 do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1153 do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1155 do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1156 do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1161 void apply(
const cudaStream_t &stream) {
1164 int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
1165 + kparam.base_idx[2] + kparam.base_idx[3])&1;
1168 CALL_SIDE_LINK_KERNEL(1,1);
1170 CALL_SIDE_LINK_KERNEL(1,0);
1173 CALL_SIDE_LINK_KERNEL(0,1);
1175 CALL_SIDE_LINK_KERNEL(0,0);
1179 #undef CALL_SIDE_LINK_KERNEL
1180 #undef CALL_ARGUMENTS
1191 long long flops()
const {
return 0; }
1194 template<
class RealA,
class RealB>
1195 class AllLink :
public Tunable {
1198 const cudaGaugeField &link;
1199 const cudaGaugeField &oprod;
1200 const cudaGaugeField &Qprev;
1203 const typename RealTypeId<RealA>::Type &
coeff;
1205 cudaGaugeField &shortP;
1206 cudaGaugeField &newOprod;
1207 const hisq_kernel_param_t &
kparam;
1209 unsigned int sharedBytesPerThread()
const {
return 0; }
1210 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1213 bool tuneGridDim()
const {
return false; }
1214 unsigned int minThreads()
const {
return kparam.threads; }
1217 AllLink(
const cudaGaugeField &link,
1218 const cudaGaugeField &oprod,
1219 const cudaGaugeField &Qprev,
1221 const typename RealTypeId<RealA>::Type &coeff,
1223 cudaGaugeField &shortP, cudaGaugeField &newOprod,
1224 const hisq_kernel_param_t &kparam) :
1225 link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
1226 coeff(coeff), accumu_coeff(accumu_coeff), shortP(shortP),
1227 newOprod(newOprod), kparam(kparam)
1229 virtual ~AllLink() { ; }
1231 TuneKey tuneKey()
const {
1232 std::stringstream vol, aux;
1233 vol << kparam.D[0] <<
"x";
1234 vol << kparam.D[1] <<
"x";
1235 vol << kparam.D[2] <<
"x";
1237 aux <<
"threads=" << kparam.threads <<
",prec=" << link.Precision();
1238 aux <<
",recon=" << link.Reconstruct() <<
",sig=" <<
sig <<
",mu=" <<
mu;
1239 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
1242 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1243 ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1244 (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(), \
1245 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), sig, mu, \
1246 (typename RealTypeId<typeA>::Type)coeff, \
1247 (typename RealTypeId<typeA>::Type)accumu_coeff, \
1248 (typeA*)shortP.Even_p(),(typeA*)shortP.Odd_p(), \
1249 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
1251 #define CALL_ALL_LINK_KERNEL(sig_sign, mu_sign) \
1252 if(oddness_change == 0){ \
1253 if(sizeof(RealA) == sizeof(float2)){ \
1254 if(recon == QUDA_RECONSTRUCT_NO){ \
1255 do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1256 do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1258 do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1259 do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1262 if(recon == QUDA_RECONSTRUCT_NO){ \
1263 do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1264 do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1266 do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1267 do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1271 if(sizeof(RealA) == sizeof(float2)){ \
1272 if(recon == QUDA_RECONSTRUCT_NO){ \
1273 do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1274 do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1276 do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1277 do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1280 if(recon == QUDA_RECONSTRUCT_NO){ \
1281 do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1282 do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1284 do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1285 do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1289 void apply(
const cudaStream_t &stream) {
1292 int oddness_change = (kparam.base_idx[0] + kparam.base_idx[1]
1293 + kparam.base_idx[2] + kparam.base_idx[3])&1;
1296 CALL_ALL_LINK_KERNEL(1, 1);
1298 CALL_ALL_LINK_KERNEL(1, 0);
1300 CALL_ALL_LINK_KERNEL(0, 1);
1302 CALL_ALL_LINK_KERNEL(0, 0);
1308 #undef CALL_ARGUMENTS
1309 #undef CALL_ALL_LINK_KERNEL
1321 virtual void initTuneParam(TuneParam ¶m)
const
1324 param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1328 void defaultTuneParam(TuneParam ¶m)
const
1331 param.grid = dim3((kparam.threads+param.block.x-1)/param.block.x, 1, 1);
1334 long long flops()
const {
1335 if(
GOES_FORWARDS(
sig))
return kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]*1242;
1337 return kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]*828;
1342 template<
class RealA,
class RealB>
1343 class OneLinkTerm :
public Tunable {
1346 const cudaGaugeField &oprod;
1347 const typename RealTypeId<RealA>::Type &
coeff;
1348 cudaGaugeField &ForceMatrix;
1350 hisq_kernel_param_t
kparam;
1352 unsigned int sharedBytesPerThread()
const {
return 0; }
1353 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1356 bool tuneGridDim()
const {
return false; }
1357 unsigned int minThreads()
const {
return X[0]*X[1]*X[2]*X[3]/2; }
1360 OneLinkTerm(
const cudaGaugeField &oprod,
1361 const typename RealTypeId<RealA>::Type &coeff,
1363 oprod(oprod), coeff(coeff), ForceMatrix(ForceMatrix)
1365 for(
int dir=0; dir<4; ++dir) X[dir] = param.
X[dir];
1367 kparam.threads = X[0]*X[1]*X[2]*X[3]/2;
1368 for(
int dir=0; dir<4; ++dir){
1369 kparam.X[dir] = X[dir];
1371 kparam.setStride(param);
1374 virtual ~OneLinkTerm() { ; }
1376 TuneKey tuneKey()
const {
1377 std::stringstream vol, aux;
1382 int threads = X[0]*X[1]*X[2]*X[3]/2;
1383 aux <<
"threads=" << threads <<
",prec=" << oprod.Precision();
1384 aux <<
",coeff=" <<
coeff;
1385 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
1388 void apply(
const cudaStream_t &stream) {
1394 do_one_link_term_kernel<RealA,0><<<tp.grid,tp.block>>>(
static_cast<const RealA*
>(oprod.Even_p()),
1395 static_cast<const RealA*>(oprod.Odd_p()),
1397 static_cast<RealA*>(ForceMatrix.Even_p()),
1398 static_cast<RealA*>(ForceMatrix.Odd_p()),
1400 do_one_link_term_kernel<RealA,1><<<tp.grid,tp.block>>>(
static_cast<const RealA*
>(oprod.Even_p()),
1401 static_cast<const RealA*>(oprod.Odd_p()),
1403 static_cast<RealA*>(ForceMatrix.Even_p()),
1404 static_cast<RealA*>(ForceMatrix.Odd_p()),
1410 ForceMatrix.backup();
1414 ForceMatrix.restore();
1417 long long flops()
const {
1418 return 72*kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3];
1423 template<
class RealA,
class RealB>
1424 class LongLinkTerm :
public Tunable {
1427 const cudaGaugeField &link;
1428 const cudaGaugeField &naikOprod;
1429 const typename RealTypeId<RealA>::Type naik_coeff;
1430 cudaGaugeField &output;
1432 const hisq_kernel_param_t &
kparam;
1434 unsigned int sharedBytesPerThread()
const {
return 0; }
1435 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1438 bool tuneGridDim()
const {
return false; }
1439 unsigned int minThreads()
const {
return X[0]*X[1]*X[2]*X[3]/2; }
1442 LongLinkTerm(
const cudaGaugeField &link,
const cudaGaugeField &naikOprod,
1443 const typename RealTypeId<RealA>::Type &naik_coeff,
1444 cudaGaugeField &output,
const hisq_kernel_param_t &kparam) :
1445 link(link), naikOprod(naikOprod), naik_coeff(naik_coeff), output(output),
1447 {
for(
int dir=0; dir<4; ++dir) X[dir] = kparam.X[dir]; }
1449 virtual ~LongLinkTerm() { ; }
1451 TuneKey tuneKey()
const {
1452 std::stringstream vol, aux;
1457 int threads = X[0]*X[1]*X[2]*X[3]/2;
1458 aux <<
"threads=" << threads <<
",prec=" << link.Precision();
1459 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
1462 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid,tp.block>>> \
1463 ((typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1464 (typeA*)naikOprod.Even_p(), (typeA*)naikOprod.Odd_p(), \
1466 (typeA*)output.Even_p(), (typeA*)output.Odd_p(), \
1469 void apply(
const cudaStream_t &stream) {
1473 if(
sizeof(RealA) ==
sizeof(float2)){
1475 do_longlink_sp_18_kernel<float2,float2, 0> CALL_ARGUMENTS(float2, float2);
1476 do_longlink_sp_18_kernel<float2,float2, 1> CALL_ARGUMENTS(float2, float2);
1478 do_longlink_sp_12_kernel<float2,float4, 0> CALL_ARGUMENTS(float2, float4);
1479 do_longlink_sp_12_kernel<float2,float4, 1> CALL_ARGUMENTS(float2, float4);
1483 do_longlink_dp_18_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1484 do_longlink_dp_18_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1486 do_longlink_dp_12_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1487 do_longlink_dp_12_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1492 #undef CALL_ARGUMENTS
1502 long long flops()
const {
return 4968*kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]; }
1508 template<
class RealA,
class RealB>
1509 class CompleteForce :
public Tunable {
1512 const cudaGaugeField &link;
1513 const cudaGaugeField &oprod;
1514 cudaGaugeField &mom;
1516 hisq_kernel_param_t
kparam;
1518 unsigned int sharedBytesPerThread()
const {
return 0; }
1519 unsigned int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1522 bool tuneGridDim()
const {
return false; }
1523 unsigned int minThreads()
const {
return X[0]*X[1]*X[2]*X[3]/2; }
1526 CompleteForce(
const cudaGaugeField &link,
const cudaGaugeField &oprod,
1528 link(link), oprod(oprod), mom(mom)
1531 for(
int dir=0; dir<4; ++dir){
1532 X[dir] = param.
X[dir];
1533 kparam.X[dir] = X[dir];
1535 kparam.threads = X[0]*X[1]*X[2]*X[3]/2;
1536 kparam.setStride(param);
1539 virtual ~CompleteForce() { ; }
1541 TuneKey tuneKey()
const {
1542 std::stringstream vol, aux;
1547 int threads = X[0]*X[1]*X[2]*X[3]/2;
1548 aux <<
"threads=" << threads <<
",prec=" << link.Precision();
1549 return TuneKey(vol.str().c_str(),
typeid(*this).name(), aux.str().c_str());
1552 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1553 ((typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1554 (typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1555 (typeA*)mom.Even_p(), (typeA*)mom.Odd_p(), \
1558 void apply(
const cudaStream_t &stream) {
1565 if(
sizeof(RealA) ==
sizeof(float2)){
1567 do_complete_force_sp_18_kernel<float2,float2, 0> CALL_ARGUMENTS(float2, float2);
1568 do_complete_force_sp_18_kernel<float2,float2, 1> CALL_ARGUMENTS(float2, float2);
1570 do_complete_force_sp_12_kernel<float2,float4, 0> CALL_ARGUMENTS(float2, float4);
1571 do_complete_force_sp_12_kernel<float2,float4, 1> CALL_ARGUMENTS(float2, float4);
1575 do_complete_force_dp_18_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1576 do_complete_force_dp_18_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1578 do_complete_force_dp_12_kernel<double2,double2, 0> CALL_ARGUMENTS(double2, double2);
1579 do_complete_force_dp_12_kernel<double2,double2, 1> CALL_ARGUMENTS(double2, double2);
1584 #undef CALL_ARGUMENTS
1594 long long flops()
const {
1595 return kparam.X[0]*kparam.X[1]*kparam.X[2]*kparam.X[3]*792;
1601 bind_tex_link(
const cudaGaugeField& link,
const cudaGaugeField& newOprod)
1604 cudaBindTexture(0, thinLink0TexDouble, link.Even_p(), link.Bytes()/2);
1605 cudaBindTexture(0, thinLink1TexDouble, link.Odd_p(), link.Bytes()/2);
1607 cudaBindTexture(0, newOprod0TexDouble, newOprod.Even_p(), newOprod.Bytes()/2);
1608 cudaBindTexture(0, newOprod1TexDouble, newOprod.Odd_p(), newOprod.Bytes()/2);
1611 cudaBindTexture(0, thinLink0TexSingle, link.Even_p(), link.Bytes()/2);
1612 cudaBindTexture(0, thinLink1TexSingle, link.Odd_p(), link.Bytes()/2);
1614 cudaBindTexture(0, thinLink0TexSingle_recon, link.Even_p(), link.Bytes()/2);
1615 cudaBindTexture(0, thinLink1TexSingle_recon, link.Odd_p(), link.Bytes()/2);
1617 cudaBindTexture(0, newOprod0TexSingle, newOprod.Even_p(), newOprod.Bytes()/2);
1618 cudaBindTexture(0, newOprod1TexSingle, newOprod.Odd_p(), newOprod.Bytes()/2);
1624 unbind_tex_link(
const cudaGaugeField& link,
const cudaGaugeField& newOprod)
1627 cudaUnbindTexture(thinLink0TexDouble);
1628 cudaUnbindTexture(thinLink1TexDouble);
1629 cudaUnbindTexture(newOprod0TexDouble);
1630 cudaUnbindTexture(newOprod1TexDouble);
1633 cudaUnbindTexture(thinLink0TexSingle);
1634 cudaUnbindTexture(thinLink1TexSingle);
1636 cudaUnbindTexture(thinLink0TexSingle_recon);
1637 cudaUnbindTexture(thinLink1TexSingle_recon);
1639 cudaUnbindTexture(newOprod0TexSingle);
1640 cudaUnbindTexture(newOprod1TexSingle);
1644 template<
class Real,
class RealA,
class RealB>
1648 const cudaGaugeField &oprod,
1649 const cudaGaugeField &link,
1650 cudaGaugeField &
Pmu,
1653 cudaGaugeField &
Pnumu,
1654 cudaGaugeField &
Qmu,
1655 cudaGaugeField &
Qnumu,
1656 cudaGaugeField &newOprod)
1660 Real OneLink, Lepage, FiveSt, ThreeSt, SevenSt;
1661 Real mLepage, mFiveSt, mThreeSt;
1663 OneLink = act_path_coeff.
one;
1664 ThreeSt = act_path_coeff.
three; mThreeSt = -ThreeSt;
1665 FiveSt = act_path_coeff.
five; mFiveSt = -FiveSt;
1666 SevenSt = act_path_coeff.
seven;
1667 Lepage = act_path_coeff.
lepage; mLepage = -Lepage;
1671 OneLinkTerm<RealA, RealB> oneLink(oprod, OneLink, newOprod, param);
1683 hisq_kernel_param_t kparam_1g, kparam_2g;
1685 for(
int dir=0; dir<4; ++dir){
1686 kparam_1g.X[dir] = param.
X[dir];
1687 kparam_2g.X[dir] = param.
X[dir];
1690 kparam_1g.setStride(param);
1691 kparam_2g.setStride(param);
1699 kparam_1g.D1h = kparam_1g.D[0]/2;
1704 kparam_1g.threads = kparam_1g.D[0]*kparam_1g.D[1]*kparam_1g.D[2]*kparam_1g.D[3]/2;
1710 kparam_2g.D1h = kparam_2g.D[0]/2;
1715 kparam_2g.threads = kparam_2g.D[0]*kparam_2g.D[1]*kparam_2g.D[2]*kparam_2g.D[3]/2;
1718 for(
int i=0;i < 4; i++){
1719 kparam_1g.ghostDim[i] = kparam_2g.ghostDim[i]=kparam_1g.ghostDim[i]=kparam_2g.ghostDim[i] = ghostDim[i];
1722 hisq_kernel_param_t
kparam;
1723 kparam.D[0] = param.
X[0];
1724 kparam.D[1] = param.
X[1];
1725 kparam.D[2] = param.
X[2];
1726 kparam.D[3] = param.
X[3];
1727 kparam.D1h = param.
X[0]/2;
1728 kparam.threads=param.
X[0]*param.
X[1]*param.
X[2]*param.
X[3]/2;
1729 kparam.base_idx[0]=0;
1730 kparam.base_idx[1]=0;
1731 kparam.base_idx[2]=0;
1732 kparam.base_idx[3]=0;
1733 kparam_2g.threads = kparam_1g.threads = kparam.threads;
1735 for(
int i=0; i<4; ++i){
1736 kparam_2g.D[i] = kparam_1g.D[i] = kparam.D[i];
1737 kparam_2g.D1h = kparam_1g.D1h = kparam.D1h;
1738 kparam_2g.base_idx[i] = kparam_1g.base_idx[i] = 0;
1739 kparam_2g.ghostDim[i] = kparam_1g.ghostDim[i] = 0;
1743 for(
int mu=0; mu<8; mu++){
1750 MiddleLink<RealA,RealB> middleLink( link, oprod,
1753 newOprod, kparam_2g);
1754 middleLink.apply(0);
1757 for(
int nu=0; nu < 8; nu++){
1759 || nu == mu || nu ==
OPP_DIR(mu)){
1764 MiddleLink<RealA,RealB> middleLink( link, Pmu, Qmu,
1767 newOprod, kparam_1g);
1768 middleLink.apply(0);
1771 for(
int rho = 0; rho < 8; rho++){
1773 || rho == mu || rho ==
OPP_DIR(mu)
1774 || rho == nu || rho ==
OPP_DIR(nu)){
1779 if(FiveSt != 0)coeff = SevenSt/FiveSt;
else coeff = 0;
1780 AllLink<RealA,RealB> allLink(link, Pnumu, Qnumu,
sig, rho, SevenSt, coeff,
1781 P5, newOprod, kparam_1g);
1790 if(ThreeSt != 0)coeff = FiveSt/ThreeSt;
else coeff = 0;
1791 SideLink<RealA,RealB> sideLink(link, P5, Qmu,
1792 sig, nu, mFiveSt, coeff,
1794 newOprod, kparam_1g);
1802 LepageMiddleLink<RealA,RealB>
1803 lepageMiddleLink ( link, Pmu, Qmu,
1806 newOprod, kparam_2g);
1807 lepageMiddleLink.apply(0);
1810 if(ThreeSt != 0)coeff = Lepage/ThreeSt ;
else coeff = 0;
1812 SideLink<RealA, RealB> sideLink(link, P5, Qmu,
1813 sig, mu, mLepage, coeff,
1815 newOprod, kparam_2g);
1823 SideLinkShort<RealA,RealB> sideLinkShort(link, P3,
1825 newOprod, kparam_1g);
1826 sideLinkShort.apply(0);
1845 const cudaGaugeField &oprod,
1846 const cudaGaugeField &link,
1847 cudaGaugeField* force,
1850 bind_tex_link(link, oprod);
1853 CompleteForce<double2,double2> completeForce(link, oprod, *force, param);
1854 completeForce.apply(0);
1855 if(flops) *flops = completeForce.flops();
1858 CompleteForce<float2,float2> completeForce(link, oprod, *force, param);
1859 completeForce.apply(0);
1860 if(flops) *flops = completeForce.flops();
1867 unbind_tex_link(link, oprod);
1874 const cudaGaugeField &oldOprod,
1875 const cudaGaugeField &link,
1876 cudaGaugeField *newOprod,
1879 bind_tex_link(link, *newOprod);
1880 const int volume = param.
X[0]*param.
X[1]*param.
X[2]*param.
X[3];
1881 hisq_kernel_param_t
kparam;
1882 for(
int i=0; i<4; i++){
1883 kparam.X[i] = param.
X[i];
1886 kparam.threads = volume/2;
1887 kparam.setStride(param);
1890 LongLinkTerm<double2,double2> longLink(link, oldOprod, coeff, *newOprod, kparam);
1892 if(flops) (*flops) = longLink.flops();
1895 LongLinkTerm<float2,float2> longLink(link, oldOprod, static_cast<float>(coeff), *newOprod, kparam);
1897 if(flops) (*flops) = longLink.flops();
1902 unbind_tex_link(link, *newOprod);
1913 const cudaGaugeField &oprod,
1914 const cudaGaugeField &link,
1915 cudaGaugeField* newOprod,
1921 param.
X[0]+4, param.
X[1]+4, param.
X[2]+4, param.
X[3]+4
1925 param.
X[0], param.
X[1], param.
X[2], param.
X[3]
1943 bind_tex_link(link, *newOprod);
1945 cudaEvent_t start,
end;
1947 cudaEventCreate(&start);
1948 cudaEventCreate(&end);
1950 cudaEventRecord(start);
1954 act_path_coeff.
one = path_coeff_array[0];
1955 act_path_coeff.
naik = path_coeff_array[1];
1956 act_path_coeff.
three = path_coeff_array[2];
1957 act_path_coeff.
five = path_coeff_array[3];
1958 act_path_coeff.
seven = path_coeff_array[4];
1959 act_path_coeff.
lepage = path_coeff_array[5];
1960 do_hisq_staples_force_cuda<double,double2,double2>( act_path_coeff,
1975 act_path_coeff.
one = path_coeff_array[0];
1976 act_path_coeff.
naik = path_coeff_array[1];
1977 act_path_coeff.
three = path_coeff_array[2];
1978 act_path_coeff.
five = path_coeff_array[3];
1979 act_path_coeff.
seven = path_coeff_array[4];
1980 act_path_coeff.
lepage = path_coeff_array[5];
1982 do_hisq_staples_force_cuda<float,float2,float2>( act_path_coeff,
1998 cudaEventRecord(end);
1999 cudaEventSynchronize(end);
2001 cudaEventElapsedTime(&runtime, start, end);
2004 int volume = param.
X[0]*param.
X[1]*param.
X[2]*param.
X[3];
2006 *flops = (134784 + 24192 + 103680 + 864 + 397440 + 72);
2008 if(path_coeff_array[5] != 0.) *flops += 28944;
2012 unbind_tex_link(link, *newOprod);
2014 cudaEventDestroy(start);
2015 cudaEventDestroy(end);
2023 #endif // GPU_HISQ_FORCE
QudaGaugeParam gauge_param
__device__ __host__ int linkIndex(int x[], int dx[], const int X[4])
int commDimPartitioned(int dir)
QudaVerbosity getVerbosity()
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int mu
#define HISQ_LOAD_MATRIX_12_SINGLE_TEX(gauge, dir, idx, var, stride)
addMatrixToField(Ow.data, point_d, accumu_coeff, shortPEven, shortPOdd, 1-oddBit, kparam.color_matrix_stride)
__global__ void const FloatN FloatM FloatM Float Float int threads
void mat(void *out, void **fatlink, void **longlink, void *in, double kappa, int dagger_bit, QudaPrecision sPrecision, QudaPrecision gPrecision)
#define HISQ_LOAD_MATRIX_18_SINGLE_TEX(gauge, dir, idx, var, stride)
#define HISQ_LOAD_MATRIX_12_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
virtual void initTuneParam(TuneParam ¶m) const
__inline__ __device__ double fetch_double(texture< int2, 1 > t, int i)
__device__ __host__ Cmplx Conj(const Cmplx &a)
__global__ void const RealB *const const RealA *const oprodEven
cudaColorSpinorField * tmp
addMatrixToNewOprod(Ow.data, OPP_DIR(mu), new_sid, mycoeff, newOprodEven, newOprodOdd, oddBit, kparam.color_matrix_stride)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const linkOdd
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const outputEven
void hisqCompleteForceCuda(const QudaGaugeParam ¶m, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *force, long long *flops=NULL)
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
__constant__ double coeff
__global__ void const RealB *const const RealA *const const RealA *const RealTypeId< RealA >::Type RealA *const RealA *const outputOdd
updateCoords(y, mymu,(mu_positive?-1:1), kparam.X, kparam.ghostDim[mymu])
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealTypeId< RealA >::Type accumu_coeff
__inline__ __device__ double2 fetch_double2(texture< int4, 1 > t, int i)
#define READ_DOUBLE2_TEXTURE(x_tex, x, i)
virtual void defaultTuneParam(TuneParam ¶m) const
void hisqLongLinkForceCuda(double coeff, const QudaGaugeParam ¶m, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int int RealTypeId< RealA >::Type RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const RealA *const hisq_kernel_param_t kparam
void hisqStaplesForceCuda(const double path_coeff[6], const QudaGaugeParam ¶m, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
storeMatrixToField(Oy.data, new_sid, P3Even, P3Odd, oddBit, kparam.color_matrix_stride)
#define GOES_BACKWARDS(dir)
__host__ __device__ complex< ValueType > operator*(const complex< ValueType > &lhs, const complex< ValueType > &rhs)
enum QudaReconstructType_s QudaReconstructType
__host__ __device__ float4 operator+=(float4 &x, const float4 y)
#define GOES_FORWARDS(dir)
#define HISQ_LOAD_MATRIX_18_DOUBLE_TEX(gauge_tex, gauge, dir, idx, var, stride)
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const const RealB *const int sig
__global__ void const RealA *const const RealA *const const RealA *const const RealB *const linkEven
loadMatrixFromField(oprodEven, oprodOdd, point_c, Oy.data, oddBit, kparam.color_matrix_stride)
#define TEX1DFETCH(type, tex, idx)
__global__ void const RealA *const oprodOdd
__device__ __host__ void getCoords(int x[4], int cb_index, const int X[4], int parity)