10 #define COMPILE_HISQ_DP_18
11 #define COMPILE_HISQ_DP_12
12 #define COMPILE_HISQ_SP_18
13 #define COMPILE_HISQ_SP_12
16 #define HISQ_SITE_MATRIX_LOAD_TEX 1
17 #define HISQ_NEW_OPROD_LOAD_TEX 1
20 namespace fermion_force {
37 static int hisq_force_init_cuda_flag = 0;
39 if (hisq_force_init_cuda_flag){
42 hisq_force_init_cuda_flag=1;
44 int Vh = param->
X[0]*param->
X[1]*param->
X[2]*param->
X[3]/2;
48 int Vh_ex = (param->
X[0]+4)*(param->
X[1]+4)*(param->
X[2]+4)*(param->
X[3]+4)/2;
79 inline __device__ float2
operator*(
float a,
const float2 & b)
81 return make_float2(a*b.x,a*b.y);
84 inline __device__ double2
operator*(
double a,
const double2 & b)
86 return make_double2(a*b.x,a*b.y);
89 inline __device__
const float2 &
operator+=(float2 & a,
const float2 & b)
96 inline __device__
const double2 &
operator+=(double2 & a,
const double2 & b)
103 inline __device__
const float4 &
operator+=(float4 & a,
const float4 & b)
136 #define CONJ_INDEX(i,j) j*3 + i
157 template<
int N,
class T>
162 const T*
const field = (oddness)?field_odd:field_even;
163 for(
int i = 0;i < N ;i++){
164 mat[i] = field[idx + dir*N*stride + i*
stride];
184 const float4*
const field = oddness?field_odd: field_even;
186 tmp = field[idx + dir*stride*3];
187 mat[0] = make_float2(tmp.x, tmp.y);
188 mat[1] = make_float2(tmp.z, tmp.w);
189 tmp = field[idx + dir*stride*3 +
stride];
190 mat[2] = make_float2(tmp.x, tmp.y);
191 mat[3] = make_float2(tmp.z, tmp.w);
192 tmp = field[idx + dir*stride*3 + 2*
stride];
193 mat[4] = make_float2(tmp.x, tmp.y);
194 mat[5] = make_float2(tmp.z, tmp.w);
202 const T*
const field = (oddness)?field_odd:field_even;
204 mat[1] = field[idx +
stride];
205 mat[2] = field[idx + stride*2];
206 mat[3] = field[idx + stride*3];
207 mat[4] = field[idx + stride*4];
208 mat[5] = field[idx + stride*5];
209 mat[6] = field[idx + stride*6];
210 mat[7] = field[idx + stride*7];
211 mat[8] = field[idx + stride*8];
217 #define addMatrixToNewOprod(mat, dir, idx, coeff, field_even, field_odd, oddness) do { \
218 RealA* const field = (oddness)?field_odd: field_even; \
220 value[0] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9); \
221 value[1] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + hf.color_matrix_stride); \
222 value[2] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 2*hf.color_matrix_stride); \
223 value[3] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 3*hf.color_matrix_stride); \
224 value[4] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 4*hf.color_matrix_stride); \
225 value[5] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 5*hf.color_matrix_stride); \
226 value[6] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 6*hf.color_matrix_stride); \
227 value[7] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 7*hf.color_matrix_stride); \
228 value[8] = LOAD_TEX_ENTRY( ((oddness)?NEWOPROD_ODD_TEX:NEWOPROD_EVEN_TEX), field, idx+dir*hf.color_matrix_stride*9 + 8*hf.color_matrix_stride); \
229 field[idx + dir*hf.color_matrix_stride*9] = value[0] + coeff*mat[0]; \
230 field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride] = value[1] + coeff*mat[1]; \
231 field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*2] = value[2] + coeff*mat[2]; \
232 field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*3] = value[3] + coeff*mat[3]; \
233 field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*4] = value[4] + coeff*mat[4]; \
234 field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*5] = value[5] + coeff*mat[5]; \
235 field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*6] = value[6] + coeff*mat[6]; \
236 field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*7] = value[7] + coeff*mat[7]; \
237 field[idx + dir*hf.color_matrix_stride*9 + hf.color_matrix_stride*8] = value[8] + coeff*mat[8]; \
244 template<
class T,
class U>
247 T*
const field_even, T*
const field_odd,
int oddness)
249 T*
const field = (oddness)?field_odd: field_even;
264 template<
class T,
class U>
267 T*
const field_odd,
int oddness)
269 T*
const field = (oddness)?field_odd: field_even;
270 field[
idx ] += coeff*mat[0];
283 template<
class T,
class U>
286 T*
const field_odd,
int oddness)
288 T*
const field = (oddness)?field_odd: field_even;
290 field[
idx ] += coeff*mat[0];
300 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
301 printf(
"value is coeff(%f) * mat[0].x(%f)=%f\n", coeff, mat[0].
x, field[idx].x);
310 T*
const field = (oddness)?field_odd: field_even;
329 T*
const field = (oddness)?field_odd: field_even;
344 template<
class T,
class U>
347 T*
const mom_even, T*
const mom_odd,
int oddness)
349 T*
const mom_field = (oddness)?mom_odd:mom_even;
351 temp2.x = (mat[1].x - mat[3].x)*0.5*coeff;
352 temp2.y = (mat[1].y + mat[3].y)*0.5*coeff;
355 temp2.x = (mat[2].x - mat[6].x)*0.5*coeff;
356 temp2.y = (mat[2].y + mat[6].y)*0.5*coeff;
359 temp2.x = (mat[5].x - mat[7].x)*0.5*coeff;
360 temp2.y = (mat[5].y + mat[7].y)*0.5*coeff;
363 const typename RealTypeId<T>::Type temp = (mat[0].y + mat[4].y + mat[8].y)*0.3333333333333333333333333;
364 temp2.x = (mat[0].y-temp)*coeff;
365 temp2.y = (mat[4].y-temp)*coeff;
368 temp2.x = (mat[8].y - temp)*coeff;
376 template<
int pos_dir,
int odd_lattice>
400 template<
int odd_lattice>
412 template<
class RealX>
438 if( (i[3]&1)==1) *sign=-1;
442 if( ((i[3]+i[0])&1) == 1) *sign=-1;
446 if( ((i[3]+i[0]+i[1])&1) == 1) *sign=-1;
452 || (i[3] == 1 &&
Pt0)) {
456 if(i[3] ==
X4m1) *sign=-1;
461 #if (!defined(__CUDA_ARCH__) || (__COMPUTE_CAPABILITY__>=200))
462 printf(
"Error: invalid dir\n");
474 template<
class RealA,
int oddBit>
480 int sid = blockIdx.x * blockDim.x + threadIdx.x;
481 if (sid >= threads)
return;
490 int x1odd = (x[1] + x[2] + x[3] + oddBit) & 1;
491 x[0] = 2*x1h +
x1odd;
507 #define DD_CONCAT(n,r) n ## r ## kernel
509 #define HISQ_KERNEL_NAME(a,b) DD_CONCAT(a,b)
512 #define NEWOPROD_EVEN_TEX newOprod0TexDouble
513 #define NEWOPROD_ODD_TEX newOprod1TexDouble
514 #if (HISQ_NEW_OPROD_LOAD_TEX == 1)
515 #define LOAD_TEX_ENTRY(tex, field, idx) READ_DOUBLE2_TEXTURE(tex, field, idx)
517 #define LOAD_TEX_ENTRY(tex, field, idx) field[idx]
523 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
524 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) HISQ_LOAD_MATRIX_18_DOUBLE_TEX((oddness)?siteLink1TexDouble:siteLink0TexDouble, (oddness)?linkOdd:linkEven, dir, idx, var, hf.site_ga_stride)
526 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, hf.site_ga_stride)
528 #define COMPUTE_LINK_SIGN(sign, dir, x)
529 #define RECONSTRUCT_SITE_LINK(var, sign)
533 #undef HISQ_LOAD_LINK
534 #undef COMPUTE_LINK_SIGN
535 #undef RECONSTRUCT_SITE_LINK
540 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
541 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) HISQ_LOAD_MATRIX_12_DOUBLE_TEX((oddness)?siteLink1TexDouble:siteLink0TexDouble, (oddness)?linkOdd:linkEven,dir, idx, var, hf.site_ga_stride)
543 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) loadMatrixFromField<6>(linkEven, linkOdd, dir, idx, var, oddness, hf.site_ga_stride)
545 #define COMPUTE_LINK_SIGN(sign, dir, x) reconstructSign(sign, dir, x)
546 #define RECONSTRUCT_SITE_LINK(var, sign) FF_RECONSTRUCT_LINK_12(var, sign)
550 #undef HISQ_LOAD_LINK
551 #undef COMPUTE_LINK_SIGN
552 #undef RECONSTRUCT_SITE_LINK
553 #undef NEWOPROD_EVEN_TEX
554 #undef NEWOPROD_ODD_TEX
555 #undef LOAD_TEX_ENTRY
558 #define NEWOPROD_EVEN_TEX newOprod0TexSingle
559 #define NEWOPROD_ODD_TEX newOprod1TexSingle
561 #if (HISQ_NEW_OPROD_LOAD_TEX==1)
562 #define LOAD_TEX_ENTRY(tex, field, idx) tex1Dfetch(tex,idx)
564 #define LOAD_TEX_ENTRY(tex, field, idx) field[idx]
570 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
571 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) HISQ_LOAD_MATRIX_18_SINGLE_TEX((oddness)?siteLink1TexSingle:siteLink0TexSingle, dir, idx, var, hf.site_ga_stride)
573 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, hf.site_ga_stride)
575 #define COMPUTE_LINK_SIGN(sign, dir, x)
576 #define RECONSTRUCT_SITE_LINK(var, sign)
580 #undef HISQ_LOAD_LINK
581 #undef COMPUTE_LINK_SIGN
582 #undef RECONSTRUCT_SITE_LINK
587 #if (HISQ_SITE_MATRIX_LOAD_TEX == 1)
588 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) HISQ_LOAD_MATRIX_12_SINGLE_TEX((oddness)?siteLink1TexSingle_recon:siteLink0TexSingle_recon, dir, idx, var, hf.site_ga_stride)
590 #define HISQ_LOAD_LINK(linkEven, linkOdd, dir, idx, var, oddness) loadMatrixFromField(linkEven, linkOdd, dir, idx, var, oddness, hf.site_ga_stride)
592 #define COMPUTE_LINK_SIGN(sign, dir, x) reconstructSign(sign, dir, x)
593 #define RECONSTRUCT_SITE_LINK(var, sign) FF_RECONSTRUCT_LINK_12(var, sign)
597 #undef HISQ_LOAD_LINK
598 #undef COMPUTE_LINK_SIGN
599 #undef RECONSTRUCT_SITE_LINK
600 #undef NEWOPROD_EVEN_TEX
601 #undef NEWOPROD_ODD_TEX
602 #undef LOAD_TEX_ENTRY
607 template<
class RealA,
class RealB>
623 int sharedBytesPerThread()
const {
return 0; }
624 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
628 bool advanceGridDim(TuneParam ¶m)
const {
return false; }
631 bool advanceBlockDim(TuneParam ¶m)
const
633 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
634 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
635 const unsigned int max_shared = 16384;
638 param.block.x += step;
639 if (param.block.x > max_threads || sharedBytesPerThread()*param.block.x > max_shared) {
640 param.block = dim3((kparam.
threads+max_blocks-1)/max_blocks, 1, 1);
641 param.block.x = ((param.block.x+step-1) / step) * step;
642 if (param.block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
647 param.grid = dim3((kparam.
threads+param.block.x-1)/param.block.x, 1, 1);
662 link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
663 coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
675 link(link), oprod(oprod), Qprev(link), sig(sig), mu(mu),
676 coeff(coeff), Pmu(Pmu), P3(P3), Qmu(Qmu), newOprod(newOprod), kparam(kparam)
681 std::stringstream vol, aux;
682 vol << kparam.
D1 <<
"x";
683 vol << kparam.
D2 <<
"x";
684 vol << kparam.
D3 <<
"x";
687 aux <<
",recon=" << link.
Reconstruct() <<
",sig=" << sig <<
",mu=" << mu;
688 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
692 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
693 ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
694 (typeA*)Qprev_even, (typeA*)Qprev_odd, \
695 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
697 (typeA*)Pmu.Even_p(), (typeA*)Pmu.Odd_p(), \
698 (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
699 (typeA*)Qmu.Even_p(), (typeA*)Qmu.Odd_p(), \
700 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
703 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
704 if(oddness_change ==0 ){ \
705 if(sizeof(RealA) == sizeof(float2)){ \
706 if(recon == QUDA_RECONSTRUCT_NO){ \
707 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
708 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
710 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
711 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
714 if(recon == QUDA_RECONSTRUCT_NO){ \
715 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
716 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
718 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
719 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
723 if(sizeof(RealA) == sizeof(float2)){ \
724 if(recon == QUDA_RECONSTRUCT_NO){ \
725 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
726 do_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
728 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
729 do_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
732 if(recon == QUDA_RECONSTRUCT_NO){ \
733 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
734 do_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
736 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
737 do_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
750 const void *Qprev_even = (&Qprev == &link) ? NULL : Qprev.
Even_p();
751 const void *Qprev_odd = (&Qprev == &link) ? NULL : Qprev.
Odd_p();
765 #undef CALL_ARGUMENTS
766 #undef CALL_MIDDLE_LINK_KERNEL
784 const unsigned int max_threads =
deviceProp.maxThreadsDim[0];
785 const unsigned int max_blocks =
deviceProp.maxGridSize[0];
788 param.
block.x = ((param.
block.x+step-1) / step) * step;
789 if (param.
block.x > max_threads)
errorQuda(
"Local lattice volume is too large for device");
791 param.
shared_bytes = sharedBytesPerThread()*param.
block.x > sharedBytesPerBlock(param) ?
792 sharedBytesPerThread()*param.
block.x : sharedBytesPerBlock(param);
800 long long flops()
const {
return 0; }
804 template<
class RealA,
class RealB>
818 int sharedBytesPerThread()
const {
return 0; }
819 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
822 bool advanceGridDim(
TuneParam ¶m)
const {
return false; }
823 bool advanceBlockDim(
TuneParam ¶m)
const {
837 link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
838 coeff(coeff), P3(P3), newOprod(newOprod), kparam(kparam)
843 std::stringstream vol, aux;
844 vol << kparam.
D1 <<
"x";
845 vol << kparam.
D2 <<
"x";
846 vol << kparam.
D3 <<
"x";
849 aux <<
",recon=" << link.
Reconstruct() <<
",sig=" << sig <<
",mu=" << mu;
850 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
853 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
854 ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
855 (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(), \
856 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
858 (typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
859 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), \
862 #define CALL_MIDDLE_LINK_KERNEL(sig_sign, mu_sign) \
863 if(oddness_change == 0){ \
864 if(sizeof(RealA) == sizeof(float2)){ \
865 if(recon == QUDA_RECONSTRUCT_NO){ \
866 do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
867 do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
869 do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
870 do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
873 if(recon == QUDA_RECONSTRUCT_NO){ \
874 do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
875 do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
877 do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
878 do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
882 if(sizeof(RealA) == sizeof(float2)){ \
883 if(recon == QUDA_RECONSTRUCT_NO){ \
884 do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
885 do_lepage_middle_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
887 do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
888 do_lepage_middle_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
891 if(recon == QUDA_RECONSTRUCT_NO){ \
892 do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
893 do_lepage_middle_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
895 do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
896 do_lepage_middle_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
920 #undef CALL_ARGUMENTS
921 #undef CALL_MIDDLE_LINK_KERNEL
946 long long flops()
const {
return 0; }
949 template<
class RealA,
class RealB>
964 int sharedBytesPerThread()
const {
return 0; }
965 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
968 bool advanceGridDim(
TuneParam ¶m)
const {
return false; }
969 bool advanceBlockDim(
TuneParam ¶m)
const {
985 link(link), P3(P3), oprod(oprod),
986 sig(sig), mu(mu), coeff(coeff), accumu_coeff(accumu_coeff),
987 shortP(shortP), newOprod(newOprod), kparam(kparam)
992 std::stringstream vol, aux;
993 vol << kparam.
D1 <<
"x";
994 vol << kparam.
D2 <<
"x";
995 vol << kparam.
D3 <<
"x";
998 aux <<
",recon=" << link.
Reconstruct() <<
",sig=" << sig <<
",mu=" << mu;
999 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
1002 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1003 ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
1004 (typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1005 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1008 (typename RealTypeId<typeA>::Type) accumu_coeff, \
1009 (typeA*)shortP.Even_p(), (typeA*)shortP.Odd_p(), \
1010 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), \
1013 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
1014 if(oddness_change == 0){ \
1015 if(sizeof(RealA) == sizeof(float2)){ \
1016 if(recon == QUDA_RECONSTRUCT_NO){ \
1017 do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1018 do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1020 do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1021 do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1024 if(recon == QUDA_RECONSTRUCT_NO){ \
1025 do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1026 do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1028 do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1029 do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1033 if(sizeof(RealA) == sizeof(float2)){ \
1034 if(recon == QUDA_RECONSTRUCT_NO){ \
1035 do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1036 do_side_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1038 do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1039 do_side_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1042 if(recon == QUDA_RECONSTRUCT_NO){ \
1043 do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1044 do_side_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1046 do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1047 do_side_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1069 #undef CALL_SIDE_LINK_KERNEL
1070 #undef CALL_ARGUMENTS
1099 template<
class RealA,
class RealB>
1111 int sharedBytesPerThread()
const {
return 0; }
1112 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1115 bool advanceGridDim(
TuneParam ¶m)
const {
return false; }
1116 bool advanceBlockDim(
TuneParam ¶m)
const {
1126 link(link), P3(P3), sig(sig), mu(mu), coeff(coeff), newOprod(newOprod), kparam(kparam)
1131 std::stringstream vol, aux;
1132 vol << kparam.
D1 <<
"x";
1133 vol << kparam.
D2 <<
"x";
1134 vol << kparam.
D3 <<
"x";
1137 aux <<
",recon=" << link.
Reconstruct() <<
",sig=" << sig <<
",mu=" << mu;
1138 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
1141 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1142 ((typeA*)P3.Even_p(), (typeA*)P3.Odd_p(), \
1143 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1144 sig, mu, (typename RealTypeId<typeA>::Type) coeff, \
1145 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
1148 #define CALL_SIDE_LINK_KERNEL(sig_sign, mu_sign) \
1149 if(oddness_change == 0){ \
1150 if(sizeof(RealA) == sizeof(float2)){ \
1151 if(recon == QUDA_RECONSTRUCT_NO){ \
1152 do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1153 do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1155 do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1156 do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1159 if(recon == QUDA_RECONSTRUCT_NO){ \
1160 do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1161 do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1163 do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1164 do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1168 if(sizeof(RealA) == sizeof(float2)){ \
1169 if(recon == QUDA_RECONSTRUCT_NO){ \
1170 do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1171 do_side_link_short_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1173 do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1174 do_side_link_short_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1177 if(recon == QUDA_RECONSTRUCT_NO){ \
1178 do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1179 do_side_link_short_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1181 do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1182 do_side_link_short_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1187 void apply(
const cudaStream_t &stream) {
1205 #undef CALL_SIDE_LINK_KERNEL
1206 #undef CALL_ARGUMENTS
1233 template<
class RealA,
class RealB>
1248 int sharedBytesPerThread()
const {
return 0; }
1249 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1252 bool advanceGridDim(
TuneParam ¶m)
const {
return false; }
1253 bool advanceBlockDim(
TuneParam ¶m)
const {
1268 link(link), oprod(oprod), Qprev(Qprev), sig(sig), mu(mu),
1269 coeff(coeff), accumu_coeff(accumu_coeff), shortP(shortP),
1270 newOprod(newOprod), kparam(kparam)
1275 std::stringstream vol, aux;
1276 vol << kparam.
D1 <<
"x";
1277 vol << kparam.
D2 <<
"x";
1278 vol << kparam.
D3 <<
"x";
1281 aux <<
",recon=" << link.
Reconstruct() <<
",sig=" << sig <<
",mu=" << mu;
1282 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
1285 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1286 ((typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1287 (typeA*)Qprev.Even_p(), (typeA*)Qprev.Odd_p(), \
1288 (typeB*)link.Even_p(), (typeB*)link.Odd_p(), sig, mu, \
1289 (typename RealTypeId<typeA>::Type)coeff, \
1290 (typename RealTypeId<typeA>::Type)accumu_coeff, \
1291 (typeA*)shortP.Even_p(),(typeA*)shortP.Odd_p(), \
1292 (typeA*)newOprod.Even_p(), (typeA*)newOprod.Odd_p(), kparam)
1294 #define CALL_ALL_LINK_KERNEL(sig_sign, mu_sign) \
1295 if(oddness_change == 0){ \
1296 if(sizeof(RealA) == sizeof(float2)){ \
1297 if(recon == QUDA_RECONSTRUCT_NO){ \
1298 do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float2); \
1299 do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float2); \
1301 do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(float2, float4); \
1302 do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(float2, float4); \
1305 if(recon == QUDA_RECONSTRUCT_NO){ \
1306 do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1307 do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1309 do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 0> CALL_ARGUMENTS(double2, double2); \
1310 do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 0> CALL_ARGUMENTS(double2, double2); \
1314 if(sizeof(RealA) == sizeof(float2)){ \
1315 if(recon == QUDA_RECONSTRUCT_NO){ \
1316 do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float2); \
1317 do_all_link_sp_18_kernel<float2, float2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float2); \
1319 do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(float2, float4); \
1320 do_all_link_sp_12_kernel<float2, float4, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(float2, float4); \
1323 if(recon == QUDA_RECONSTRUCT_NO){ \
1324 do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1325 do_all_link_dp_18_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1327 do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 0, 1> CALL_ARGUMENTS(double2, double2); \
1328 do_all_link_dp_12_kernel<double2, double2, sig_sign, mu_sign, 1, 1> CALL_ARGUMENTS(double2, double2); \
1332 void apply(
const cudaStream_t &stream) {
1351 #undef CALL_ARGUMENTS
1352 #undef CALL_ALL_LINK_KERNEL
1381 template<
class RealA,
class RealB>
1391 int sharedBytesPerThread()
const {
return 0; }
1392 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1395 bool advanceGridDim(
TuneParam ¶m)
const {
return false; }
1396 bool advanceBlockDim(
TuneParam ¶m)
const {
1398 int threads = X[0]*X[1]*X[2]*X[3]/2;
1400 param.
grid = dim3((threads + param.
block.x-1)/param.
block.x, 1, 1);
1408 oprod(oprod), sig(sig), coeff(coeff), ForceMatrix(ForceMatrix), X(_X)
1414 std::stringstream vol, aux;
1419 int threads = X[0]*X[1]*X[2]*X[3]/2;
1420 aux <<
"threads=" << threads <<
",prec=" << oprod.
Precision();
1421 aux <<
",sig=" << sig <<
",coeff=" << coeff;
1422 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
1425 void apply(
const cudaStream_t &stream) {
1428 int threads = X[0]*X[1]*X[2]*X[3]/2;
1431 do_one_link_term_kernel<RealA,0><<<tp.
grid,tp.
block>>>(
static_cast<const RealA*
>(oprod.
Even_p()),
1432 static_cast<const RealA*>(oprod.
Odd_p()),
1434 static_cast<RealA*>(ForceMatrix.
Even_p()),
1435 static_cast<RealA*>(ForceMatrix.
Odd_p()),
1437 do_one_link_term_kernel<RealA,1><<<tp.
grid,tp.
block>>>(
static_cast<const RealA*
>(oprod.
Even_p()),
1438 static_cast<const RealA*>(oprod.
Odd_p()),
1440 static_cast<RealA*>(ForceMatrix.
Even_p()),
1441 static_cast<RealA*>(ForceMatrix.
Odd_p()),
1459 int threads = X[0]*X[1]*X[2]*X[3]/2;
1460 param.
grid = dim3((threads+param.
block.x-1)/param.
block.x, 1, 1);
1467 int threads = X[0]*X[1]*X[2]*X[3]/2;
1468 param.
grid = dim3((threads+param.
block.x-1)/param.
block.x, 1, 1);
1475 template<
class RealA,
class RealB>
1487 int sharedBytesPerThread()
const {
return 0; }
1488 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1491 bool advanceGridDim(
TuneParam ¶m)
const {
return false; }
1492 bool advanceBlockDim(
TuneParam ¶m)
const {
1494 int threads = X[0]*X[1]*X[2]*X[3]/2;
1495 param.
grid = dim3((threads + param.
block.x-1)/param.
block.x, 1, 1);
1503 link(link), naikOprod(naikOprod), sig(sig), naik_coeff(naik_coeff), output(output),
1504 X(_X), kparam(kparam)
1510 std::stringstream vol, aux;
1515 int threads = X[0]*X[1]*X[2]*X[3]/2;
1516 aux <<
"threads=" << threads <<
",prec=" << link.
Precision();
1517 aux <<
",sig=" << sig;
1518 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
1521 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid,tp.block>>> \
1522 ((typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1523 (typeA*)naikOprod.Even_p(), (typeA*)naikOprod.Odd_p(), \
1525 (typeA*)output.Even_p(), (typeA*)output.Odd_p(), \
1528 void apply(
const cudaStream_t &stream) {
1533 if(
sizeof(RealA) ==
sizeof(float2)){
1535 do_longlink_sp_18_kernel<float2,float2, 0>
CALL_ARGUMENTS(float2, float2);
1536 do_longlink_sp_18_kernel<float2,float2, 1>
CALL_ARGUMENTS(float2, float2);
1538 do_longlink_sp_12_kernel<float2,float4, 0>
CALL_ARGUMENTS(float2, float4);
1539 do_longlink_sp_12_kernel<float2,float4, 1>
CALL_ARGUMENTS(float2, float4);
1543 do_longlink_dp_18_kernel<double2,double2, 0>
CALL_ARGUMENTS(double2, double2);
1544 do_longlink_dp_18_kernel<double2,double2, 1>
CALL_ARGUMENTS(double2, double2);
1546 do_longlink_dp_12_kernel<double2,double2, 0>
CALL_ARGUMENTS(double2, double2);
1547 do_longlink_dp_12_kernel<double2,double2, 1>
CALL_ARGUMENTS(double2, double2);
1552 #undef CALL_ARGUMENTS
1565 int threads = X[0]*X[1]*X[2]*X[3]/2;
1566 param.
grid = dim3((threads+param.
block.x-1)/param.
block.x, 1, 1);
1573 int threads = X[0]*X[1]*X[2]*X[3]/2;
1574 param.
grid = dim3((threads+param.
block.x-1)/param.
block.x, 1, 1);
1583 template<
class RealA,
class RealB>
1593 int sharedBytesPerThread()
const {
return 0; }
1594 int sharedBytesPerBlock(
const TuneParam &)
const {
return 0; }
1597 bool advanceGridDim(
TuneParam ¶m)
const {
return false; }
1598 bool advanceBlockDim(
TuneParam ¶m)
const {
1600 int threads = X[0]*X[1]*X[2]*X[3]/2;
1601 param.
grid = dim3((threads + param.
block.x-1)/param.
block.x, 1, 1);
1608 link(link), oprod(oprod), sig(sig), mom(mom), X(_X)
1614 std::stringstream vol, aux;
1619 int threads = X[0]*X[1]*X[2]*X[3]/2;
1620 aux <<
"threads=" << threads <<
",prec=" << link.
Precision() <<
",sig=" << sig;
1621 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
1624 #define CALL_ARGUMENTS(typeA, typeB) <<<tp.grid, tp.block>>> \
1625 ((typeB*)link.Even_p(), (typeB*)link.Odd_p(), \
1626 (typeA*)oprod.Even_p(), (typeA*)oprod.Odd_p(), \
1628 (typeA*)mom.Even_p(), (typeA*)mom.Odd_p(), \
1629 X[0] * X[1] * X[2] * X[3]/2);
1631 void apply(
const cudaStream_t &stream) {
1635 if(
sizeof(RealA) ==
sizeof(float2)){
1637 do_complete_force_sp_18_kernel<float2,float2, 0>
CALL_ARGUMENTS(float2, float2);
1638 do_complete_force_sp_18_kernel<float2,float2, 1>
CALL_ARGUMENTS(float2, float2);
1640 do_complete_force_sp_12_kernel<float2,float4, 0>
CALL_ARGUMENTS(float2, float4);
1641 do_complete_force_sp_12_kernel<float2,float4, 1>
CALL_ARGUMENTS(float2, float4);
1645 do_complete_force_dp_18_kernel<double2,double2, 0>
CALL_ARGUMENTS(double2, double2);
1646 do_complete_force_dp_18_kernel<double2,double2, 1>
CALL_ARGUMENTS(double2, double2);
1648 do_complete_force_dp_12_kernel<double2,double2, 0>
CALL_ARGUMENTS(double2, double2);
1649 do_complete_force_dp_12_kernel<double2,double2, 1>
CALL_ARGUMENTS(double2, double2);
1654 #undef CALL_ARGUMENTS
1667 int threads = X[0]*X[1]*X[2]*X[3]/2;
1668 param.
grid = dim3((threads+param.
block.x-1)/param.
block.x, 1, 1);
1675 int threads = X[0]*X[1]*X[2]*X[3]/2;
1676 param.
grid = dim3((threads+param.
block.x-1)/param.
block.x, 1, 1);
1707 unbind_tex_link(
const cudaGaugeField& link,
const cudaGaugeField& newOprod)
1727 template<
class Real,
class RealA,
class RealB>
1731 const cudaGaugeField &oprod,
1732 const cudaGaugeField &link,
1733 cudaGaugeField &
Pmu,
1736 cudaGaugeField &
Pnumu,
1737 cudaGaugeField &
Qmu,
1738 cudaGaugeField &
Qnumu,
1739 cudaGaugeField &newOprod)
1743 Real OneLink, Lepage, FiveSt, ThreeSt, SevenSt;
1744 Real mLepage, mFiveSt, mThreeSt;
1746 OneLink = act_path_coeff.
one;
1747 ThreeSt = act_path_coeff.
three; mThreeSt = -ThreeSt;
1748 FiveSt = act_path_coeff.
five; mFiveSt = -FiveSt;
1749 SevenSt = act_path_coeff.
seven;
1750 Lepage = act_path_coeff.
lepage; mLepage = -Lepage;
1756 OneLinkTerm<RealA, RealB> oneLink(oprod,
sig, OneLink, newOprod, param.
X);
1778 kparam_1g.D1h = kparam_1g.D1/2;
1783 kparam_1g.threads = kparam_1g.D1*kparam_1g.D2*kparam_1g.D3*kparam_1g.D4/2;
1789 kparam_2g.D1h = kparam_2g.D1/2;
1794 kparam_2g.threads = kparam_2g.D1*kparam_2g.D2*kparam_2g.D3*kparam_2g.D4/2;
1797 for(
int i=0;i < 4; i++){
1798 kparam_1g.ghostDim[i] = kparam_2g.ghostDim[i]=kparam_1g.ghostDim[i]=kparam_2g.ghostDim[i] = ghostDim[i];
1802 kparam.D1 = param.
X[0];
1803 kparam.D2 = param.
X[1];
1804 kparam.D3 = param.
X[2];
1805 kparam.D4 = param.
X[3];
1806 kparam.D1h = param.
X[0]/2;
1807 kparam.threads=param.
X[0]*param.
X[1]*param.
X[2]*param.
X[3]/2;
1808 kparam.base_idx[0]=0;
1809 kparam.base_idx[1]=0;
1810 kparam.base_idx[2]=0;
1811 kparam.base_idx[3]=0;
1812 kparam_2g = kparam_1g =
kparam;
1816 for(
int mu=0;
mu<8;
mu++){
1823 MiddleLink<RealA,RealB> middleLink( link, oprod,
1826 newOprod, kparam_2g);
1827 middleLink.apply(0);
1830 for(
int nu=0; nu < 8; nu++){
1837 MiddleLink<RealA,RealB> middleLink( link, Pmu, Qmu,
1840 newOprod, kparam_1g);
1841 middleLink.apply(0);
1844 for(
int rho = 0; rho < 8; rho++){
1847 || rho == nu || rho ==
OPP_DIR(nu)){
1851 if(FiveSt != 0)coeff = SevenSt/FiveSt;
else coeff = 0;
1852 AllLink<RealA,RealB> allLink(link, Pnumu, Qnumu,
sig, rho, SevenSt, coeff,
1853 P5, newOprod, kparam_1g);
1862 if(ThreeSt != 0)coeff = FiveSt/ThreeSt;
else coeff = 0;
1863 SideLink<RealA,RealB> sideLink(link, P5, Qmu,
1864 sig, nu, mFiveSt, coeff,
1866 newOprod, kparam_1g);
1874 LepageMiddleLink<RealA,RealB>
1875 lepageMiddleLink ( link, Pmu, Qmu,
1878 newOprod, kparam_2g);
1879 lepageMiddleLink.apply(0);
1882 if(ThreeSt != 0)coeff = Lepage/ThreeSt ;
else coeff = 0;
1884 SideLink<RealA, RealB> sideLink(link, P5, Qmu,
1885 sig,
mu, mLepage, coeff,
1887 newOprod, kparam_2g);
1895 SideLinkShort<RealA,RealB> sideLinkShort(link, P3,
1897 newOprod, kparam_1g);
1898 sideLinkShort.apply(0);
1922 bind_tex_link(link, oprod);
1927 completeForce.
apply(0);
1931 completeForce.
apply(0);
1938 unbind_tex_link(link, oprod);
1949 bind_tex_link(link, *newOprod);
1950 const int volume = param.
X[0]*param.
X[1]*param.
X[2]*param.
X[3];
1952 for(
int i =0;i < 4;i++){
1964 *newOprod, param.
X, kparam);
1971 unbind_tex_link(link, *newOprod);
1989 param.
X[0]+4, param.
X[1]+4, param.
X[2]+4, param.
X[3]+4
1993 param.
X[0], param.
X[1], param.
X[2], param.
X[3]
2008 bind_tex_link(link, *newOprod);
2010 cudaEvent_t start,
end;
2012 cudaEventCreate(&start);
2013 cudaEventCreate(&end);
2015 cudaEventRecord(start);
2019 act_path_coeff.
one = path_coeff_array[0];
2020 act_path_coeff.
naik = path_coeff_array[1];
2021 act_path_coeff.
three = path_coeff_array[2];
2022 act_path_coeff.
five = path_coeff_array[3];
2023 act_path_coeff.
seven = path_coeff_array[4];
2024 act_path_coeff.
lepage = path_coeff_array[5];
2025 do_hisq_staples_force_cuda<double,double2,double2>( act_path_coeff,
2040 act_path_coeff.
one = path_coeff_array[0];
2041 act_path_coeff.
naik = path_coeff_array[1];
2042 act_path_coeff.
three = path_coeff_array[2];
2043 act_path_coeff.
five = path_coeff_array[3];
2044 act_path_coeff.
seven = path_coeff_array[4];
2045 act_path_coeff.
lepage = path_coeff_array[5];
2047 do_hisq_staples_force_cuda<float,float2,float2>( act_path_coeff,
2063 cudaEventRecord(end);
2064 cudaEventSynchronize(end);
2066 cudaEventElapsedTime(&runtime, start, end);
2068 unbind_tex_link(link, *newOprod);