10 #ifdef GPU_WILSON_DIRAC
17 #endif // GPU_WILSON_DIRAC
20 #ifdef GPU_STAGGERED_DIRAC
21 #if (__COMPUTE_CAPABILITY__ >= 300) // Kepler works best with texture loads only
28 #elif (__COMPUTE_CAPABILITY__ >= 200)
31 #define DIRECT_ACCESS_SPINOR
36 #define DIRECT_ACCESS_FAT_LINK
43 #endif // GPU_STAGGERED_DIRAC
72 #ifdef USE_TEXTURE_OBJECTS
73 cudaTextureObject_t inTex;
74 cudaTextureObject_t inTexNorm;
75 cudaTextureObject_t xTex;
76 cudaTextureObject_t xTexNorm;
77 cudaTextureObject_t outTex;
78 cudaTextureObject_t outTexNorm;
79 cudaTextureObject_t gauge0Tex;
80 cudaTextureObject_t gauge1Tex;
81 cudaTextureObject_t longGauge0Tex;
82 cudaTextureObject_t longGauge1Tex;
83 cudaTextureObject_t cloverTex;
84 cudaTextureObject_t cloverNormTex;
93 static cudaEvent_t packEnd[
Nstream];
94 static cudaEvent_t gatherStart[
Nstream];
95 static cudaEvent_t gatherEnd[
Nstream];
96 static cudaEvent_t scatterStart[
Nstream];
97 static cudaEvent_t scatterEnd[
Nstream];
98 static cudaEvent_t dslashStart;
99 static cudaEvent_t dslashEnd;
101 static struct timeval dslashStart_h;
103 static struct timeval commsStart[
Nstream];
104 static struct timeval commsEnd[
Nstream];
108 #ifdef DSLASH_PROFILING
109 #define DSLASH_TIME_PROFILE() dslashTimeProfile()
111 static cudaEvent_t packStart[
Nstream];
112 static cudaEvent_t kernelStart[
Nstream];
113 static cudaEvent_t kernelEnd[
Nstream];
122 #define CUDA_EVENT_RECORD(a,b) cudaEventRecord(a,b)
124 #define CUDA_EVENT_RECORD(a,b)
125 #define DSLASH_TIME_PROFILE()
128 static FaceBuffer *face;
129 static cudaColorSpinorField *inSpinor;
136 unsigned long long VolumeCB() {
return x[0]*
x[1]*
x[2]*
x[3]/2; }
153 static bool kernelPackT =
false;
163 #if defined(DIRECT_ACCESS_LINK) || defined(DIRECT_ACCESS_WILSON_SPINOR) || \
164 defined(DIRECT_ACCESS_WILSON_ACCUM) || defined(DIRECT_ACCESS_WILSON_PACK_SPINOR) || \
165 defined(DIRECT_ACCESS_WILSON_INTER) || defined(DIRECT_ACCESS_WILSON_PACK_SPINOR) || \
166 defined(DIRECT_ACCESS_CLOVER)
168 static inline __device__
float short2float(
short a) {
172 static inline __device__
short float2short(
float c,
float a) {
176 static inline __device__ short4 float42short4(
float c, float4 a) {
177 return make_short4(float2short(c, a.x), float2short(c, a.y), float2short(c, a.z), float2short(c, a.w));
180 static inline __device__ float4 short42float4(short4 a) {
181 return make_float4(short2float(a.x), short2float(a.y), short2float(a.z), short2float(a.w));
184 static inline __device__ float2 short22float2(short2 a) {
185 return make_float2(short2float(a.x), short2float(a.y));
187 #endif // DIRECT_ACCESS inclusions
202 #ifndef DSLASH_SHARED_FLOATS_PER_THREAD
203 #define DSLASH_SHARED_FLOATS_PER_THREAD 0
206 #ifndef CLOVER_SHARED_FLOATS_PER_THREAD
207 #define CLOVER_SHARED_FLOATS_PER_THREAD 0
210 #ifndef NDEGTM_SHARED_FLOATS_PER_THREAD
211 #define NDEGTM_SHARED_FLOATS_PER_THREAD 0
222 #ifndef DSLASH_PROFILING
224 for (
int i=0; i<
Nstream; i++) {
225 cudaEventCreate(&packEnd[i], cudaEventDisableTiming);
226 cudaEventCreate(&gatherStart[i], cudaEventDisableTiming);
227 cudaEventCreate(&gatherEnd[i], cudaEventDisableTiming);
228 cudaEventCreateWithFlags(&scatterStart[i], cudaEventDisableTiming);
229 cudaEventCreateWithFlags(&scatterEnd[i], cudaEventDisableTiming);
231 cudaEventCreateWithFlags(&dslashStart, cudaEventDisableTiming);
232 cudaEventCreateWithFlags(&dslashEnd, cudaEventDisableTiming);
234 cudaEventCreate(&dslashStart);
235 cudaEventCreate(&dslashEnd);
236 for (
int i=0; i<
Nstream; i++) {
237 cudaEventCreate(&packStart[i]);
238 cudaEventCreate(&packEnd[i]);
240 cudaEventCreate(&gatherStart[i]);
241 cudaEventCreate(&gatherEnd[i]);
243 cudaEventCreate(&scatterStart[i]);
244 cudaEventCreate(&scatterEnd[i]);
246 cudaEventCreate(&kernelStart[i]);
247 cudaEventCreate(&kernelEnd[i]);
249 kernelTime[i][0] = 0.0;
250 kernelTime[i][1] = 0.0;
252 gatherTime[i][0] = 0.0;
253 gatherTime[i][1] = 0.0;
255 commsTime[i][0] = 0.0;
256 commsTime[i][1] = 0.0;
258 scatterTime[i][0] = 0.0;
259 scatterTime[i][1] = 0.0;
269 for (
int i=0; i<
Nstream; i++) {
270 cudaEventDestroy(packEnd[i]);
271 cudaEventDestroy(gatherStart[i]);
272 cudaEventDestroy(gatherEnd[i]);
273 cudaEventDestroy(scatterStart[i]);
274 cudaEventDestroy(scatterEnd[i]);
277 cudaEventDestroy(dslashStart);
278 cudaEventDestroy(dslashEnd);
280 #ifdef DSLASH_PROFILING
281 for (
int i=0; i<
Nstream; i++) {
282 cudaEventDestroy(packStart[i]);
283 cudaEventDestroy(kernelStart[i]);
284 cudaEventDestroy(kernelEnd[i]);
292 #define MORE_GENERIC_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param, ...) \
294 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
295 FUNC ## 18 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
296 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
297 FUNC ## 12 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); \
299 FUNC ## 8 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
302 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
303 FUNC ## 18 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
304 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
305 FUNC ## 12 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
306 } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
307 FUNC ## 8 ## DAG ## X ## Kernel<kernel_type> <<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
313 #define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
314 switch(param.kernel_type) { \
315 case INTERIOR_KERNEL: \
316 MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
319 errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
324 #define GENERIC_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
325 switch(param.kernel_type) { \
326 case INTERIOR_KERNEL: \
327 MORE_GENERIC_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
329 case EXTERIOR_KERNEL_X: \
330 MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
332 case EXTERIOR_KERNEL_Y: \
333 MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
335 case EXTERIOR_KERNEL_Z: \
336 MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
338 case EXTERIOR_KERNEL_T: \
339 MORE_GENERIC_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
346 #define DSLASH(FUNC, gridDim, blockDim, shared, stream, param, ...) \
348 GENERIC_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
350 GENERIC_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
354 #define STAGGERED_DSLASH(gridDim, blockDim, shared, stream, param, ...) \
355 GENERIC_DSLASH(staggeredDslash, , Axpy, gridDim, blockDim, shared, stream, param, __VA_ARGS__)
358 #define MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, kernel_type, gridDim, blockDim, shared, stream, param, ...) \
359 if (reconstruct == QUDA_RECONSTRUCT_NO) { \
360 FUNC ## 18 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
361 } else if (reconstruct == QUDA_RECONSTRUCT_12) { \
362 FUNC ## 12 ## DAG ## X ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
363 } else if (reconstruct == QUDA_RECONSTRUCT_8) { \
364 FUNC ## 8 ## DAG ## X ## Kernel<kernel_type> <<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__, param); \
369 #define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
370 switch(param.kernel_type) { \
371 case INTERIOR_KERNEL: \
372 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
375 errorQuda("KernelType %d not defined for single GPU", param.kernel_type); \
380 #define GENERIC_ASYM_DSLASH(FUNC, DAG, X, gridDim, blockDim, shared, stream, param, ...) \
381 switch(param.kernel_type) { \
382 case INTERIOR_KERNEL: \
383 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
385 case EXTERIOR_KERNEL_X: \
386 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
388 case EXTERIOR_KERNEL_Y: \
389 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
391 case EXTERIOR_KERNEL_Z: \
392 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
394 case EXTERIOR_KERNEL_T: \
395 MORE_GENERIC_ASYM_DSLASH(FUNC, DAG, X, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
402 #define ASYM_DSLASH(FUNC, gridDim, blockDim, shared, stream, param, ...) \
404 GENERIC_ASYM_DSLASH(FUNC, , Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
406 GENERIC_ASYM_DSLASH(FUNC, Dagger, Xpay, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
427 warningQuda(
"Autotuner is skipping blockDim=%u (gridDim=%u) because lattice volume is too large",
443 std::stringstream ps;
444 ps <<
"block=(" <<
param.block.x <<
"," <<
param.block.y <<
"," <<
param.block.z <<
"), ";
445 ps <<
"shared=" <<
param.shared_bytes;
455 warningQuda(
"Autotuner is skipping blockDim=%u (gridDim=%u) because lattice volume is too large",
458 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
468 errorQuda(
"Lattice volume is too large for default blockDim");
500 std::stringstream vol, aux;
502 vol << dslashConstants.x[0] <<
"x";
503 vol << dslashConstants.x[1] <<
"x";
504 vol << dslashConstants.x[2] <<
"x";
505 vol << dslashConstants.x[3];
509 char comm[5], ghost[5];
517 for (
int i=0; i<4; i++) {
521 comm[4] =
'\0'; ghost[4] =
'\0';
522 aux <<
",comm=" << comm;
524 aux <<
",ghost=" << ghost;
529 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
535 #if (__COMPUTE_CAPABILITY__ >= 200 && defined(SHARED_WILSON_DSLASH))
545 int sharedBytes(
const dim3 &block)
const {
547 int block_xy = block.x*block.y;
548 if (block_xy % warpSize != 0) block_xy = ((block_xy / warpSize) + 1)*warpSize;
553 dim3 createGrid(
const dim3 &block)
const {
554 unsigned int gx = ((dslashConstants.x[0]/2)*dslashConstants.x[3] + block.x - 1) / block.x;
555 unsigned int gy = (dslashConstants.x[1] + block.y - 1 ) / block.y;
556 unsigned int gz = (dslashConstants.x[2] + block.z - 1) / block.z;
557 return dim3(gx, gy, gz);
563 const unsigned int min_threads = 2;
564 const unsigned int max_threads = 512;
565 const unsigned int max_shared = 16384*3;
569 dim3 blockInit = param.block;
571 for (
unsigned bx=blockInit.x; bx<=dslashConstants.x[0]/2; bx++) {
573 for (
unsigned by=blockInit.y; by<=dslashConstants.x[1]; by++) {
574 unsigned int gy = (dslashConstants.x[1] + by - 1 ) / by;
576 if (by > 1 && (by%2) != 0)
continue;
578 for (
unsigned bz=blockInit.z; bz<=dslashConstants.x[2]; bz++) {
579 unsigned int gz = (dslashConstants.x[2] + bz - 1) / bz;
581 if (bz > 1 && (bz%2) != 0)
continue;
582 if (bx*by*bz > max_threads)
continue;
583 if (bx*by*bz < min_threads)
continue;
585 if (by*gy != dslashConstants.x[1])
continue;
586 if (bz*gz != dslashConstants.x[2])
continue;
587 if (sharedBytes(dim3(bx, by, bz)) > max_shared)
continue;
589 param.block = dim3(bx, by, bz);
599 if (param.block.x > dslashConstants.x[0]/2 && param.block.y > dslashConstants.x[1] &&
600 param.block.z > dslashConstants.x[2] || !
set) {
602 param.block = dim3(dslashConstants.x[0]/2, 1, 1);
605 param.grid = createGrid(param.block);
606 param.shared_bytes = sharedBytes(param.block);
614 const cudaColorSpinorField *
x) :
DslashCuda(out, in, x) { ; }
616 std::string
paramString(
const TuneParam ¶m)
const
618 std::stringstream ps;
619 ps <<
"block=(" << param.block.x <<
"," << param.block.y <<
"," << param.block.z <<
"), ";
620 ps <<
"grid=(" << param.grid.x <<
"," << param.grid.y <<
"," << param.grid.z <<
"), ";
621 ps <<
"shared=" << param.shared_bytes;
629 param.block = dim3(dslashConstants.x[0]/2, 1, 1);
630 param.grid = createGrid(param.block);
631 param.shared_bytes = sharedBytes(param.block);
642 class SharedDslashCuda : public DslashCuda {
651 template <
typename sFloat,
typename gFloat>
655 const gFloat *gauge0, *gauge1;
661 int sharedBytesPerThread()
const
663 #if (__COMPUTE_CAPABILITY__ >= 200) // Fermi uses shared memory for common input
665 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
670 #else // Pre-Fermi uses shared memory only for pseudo-registers
671 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
681 reconstruct(reconstruct), dagger(dagger), a(a)
683 bindSpinorTex<sFloat>(
in,
out,
x);
690 TuneKey key = DslashCuda::tuneKey();
691 std::stringstream recon;
692 recon << reconstruct;
693 key.
aux +=
",reconstruct=" + recon.str();
694 if (
x) key.
aux +=
",Xpay";
700 #ifdef SHARED_WILSON_DSLASH
702 errorQuda(
"Shared dslash does not yet support X-dimension partitioning");
707 (sFloat*)
in->V(), (
float*)
in->Norm(), (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
710 long long flops()
const {
return (
x ? 1368ll : 1320ll) * dslashConstants.VolumeCB(); }
713 template <
typename sFloat,
typename gFloat,
typename cFloat>
717 const gFloat *gauge0, *gauge1;
719 const cFloat *clover;
720 const float *cloverNorm;
725 int sharedBytesPerThread()
const
727 #if (__COMPUTE_CAPABILITY__ >= 200)
729 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
735 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
744 :
SharedDslashCuda(out, in, x), gauge0(gauge0), gauge1(gauge1), clover(clover),
745 cloverNorm(cloverNorm), reconstruct(reconstruct), dagger(dagger), a(a)
747 bindSpinorTex<sFloat>(
in,
out,
x);
753 TuneKey key = DslashCuda::tuneKey();
754 std::stringstream recon;
755 recon << reconstruct;
756 key.
aux +=
",reconstruct=" + recon.str();
757 if (
x) key.
aux +=
",Xpay";
763 #ifdef SHARED_WILSON_DSLASH
765 errorQuda(
"Shared dslash does not yet support X-dimension partitioning");
769 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, clover, cloverNorm,
770 (sFloat*)
in->V(), (
float*)
in->Norm(), (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
773 long long flops()
const {
return (
x ? 1872ll : 1824ll) * dslashConstants.VolumeCB(); }
776 template <
typename sFloat,
typename gFloat,
typename cFloat>
780 const gFloat *gauge0, *gauge1;
782 const cFloat *clover;
783 const float *cloverNorm;
788 int sharedBytesPerThread()
const
790 #if (__COMPUTE_CAPABILITY__ >= 200)
792 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
798 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
808 :
SharedDslashCuda(out, in, x), gauge0(gauge0), gauge1(gauge1), clover(clover),
809 cloverNorm(cloverNorm), reconstruct(reconstruct), dagger(dagger), a(a)
811 bindSpinorTex<sFloat>(
in,
out,
x);
812 if (!x)
errorQuda(
"Asymmetric clover dslash only defined for Xpay");
818 TuneKey key = DslashCuda::tuneKey();
819 std::stringstream recon;
820 recon << reconstruct;
821 key.
aux +=
",reconstruct=" + recon.str() +
",Xpay";
827 #ifdef SHARED_WILSON_DSLASH
829 errorQuda(
"Shared dslash does not yet support X-dimension partitioning");
833 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1, clover, cloverNorm,
834 (sFloat*)
in->V(), (
float*)
in->Norm(), (sFloat*)
x, (
float*)
x->Norm(), a);
837 long long flops()
const {
return 1872ll * dslashConstants.VolumeCB(); }
843 a = 2.0 * kappa *
mu;
846 a = -2.0 * kappa *
mu;
847 b = 1.0 / (1.0 + a*a);
849 errorQuda(
"Twist type %d not defined\n", twist);
851 if (dagger) a *= -1.0;
855 template <
typename sFloat,
typename gFloat>
859 const gFloat *gauge0, *gauge1;
865 int sharedBytesPerThread()
const
867 #if (__COMPUTE_CAPABILITY__ >= 200)
869 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
875 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
884 const double epsilon,
const int dagger)
886 reconstruct(reconstruct), dagger(dagger)
888 bindSpinorTex<sFloat>(
in,
out,
x);
897 a =
kappa, b =
mu, c = epsilon;
904 TuneKey key = DslashCuda::tuneKey();
905 std::stringstream recon;
906 recon << reconstruct;
907 key.
aux +=
",reconstruct=" + recon.str();
908 if (
x) key.
aux +=
",Xpay";
914 #ifdef SHARED_WILSON_DSLASH
916 errorQuda(
"Shared dslash does not yet support X-dimension partitioning");
922 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1,
923 (sFloat*)
in->V(), (
float*)
in->Norm(), a, b, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0));
927 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1,
928 (sFloat*)
in->V(), (
float*)
in->Norm(), a, b, c, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0));
932 long long flops()
const {
return (
x ? 1416ll : 1392ll) * dslashConstants.VolumeCB(); }
935 template <
typename sFloat,
typename gFloat>
939 const gFloat *gauge0, *gauge1;
947 warningQuda(
"Autotuner is skipping blockDim=(%u,%u,%u), gridDim=(%u,%u,%u) because lattice volume is too large",
959 const unsigned int max_shared = 16384;
960 const int step[2] = {
deviceProp.warpSize, 1 };
961 bool advance[2] = {
false,
false };
964 param.
block.x += step[0];
966 sharedBytesPerThread()*param.
block.x*param.
block.y > max_shared) {
968 param.
block.x = step[0];
974 param.
block.y += step[1];
976 if (param.
block.y >
in->X(4) ||
977 sharedBytesPerThread()*param.
block.x*param.
block.y > max_shared) {
979 param.
block.y = step[1];
985 if (advance[0] || advance[1]) {
990 if (!checkGrid(param)) advance = advanceBlockDim(param);
1003 const double a,
const int dagger)
1004 :
DslashCuda(out, in, x), gauge0(gauge0), gauge1(gauge1), mferm(mferm),
1005 reconstruct(reconstruct), dagger(dagger), a(a)
1007 bindSpinorTex<sFloat>(
in,
out,
x);
1013 Tunable::initTuneParam(param);
1017 if (!checkGrid(param)) ok = advanceBlockDim(param);
1018 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
1024 Tunable::defaultTuneParam(param);
1028 if (!checkGrid(param)) ok = advanceBlockDim(param);
1029 if (!ok)
errorQuda(
"Lattice volume is too large for even the largest blockDim");
1034 TuneKey key = DslashCuda::tuneKey();
1035 std::stringstream ls, recon;
1036 ls << dslashConstants.Ls;
1037 recon << reconstruct;
1038 key.
volume +=
"x" + ls.str();
1039 key.
aux +=
",reconstruct=" + recon.str();
1040 if (
x) key.
aux +=
",Xpay";
1048 (sFloat*)
out->V(), (
float*)
out->Norm(), gauge0, gauge1,
1049 (sFloat*)
in->V(), (
float*)
in->Norm(), mferm, (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
1053 long long bulk = (dslashConstants.Ls-2)*(dslashConstants.VolumeCB()/dslashConstants.Ls);
1054 long long wall = 2*dslashConstants.VolumeCB()/dslashConstants.Ls;
1055 return (
x ? 1368ll : 1320ll)*dslashConstants.VolumeCB()*dslashConstants.Ls + 96ll*bulk + 120ll*wall;
1059 template <
typename sFloat,
typename fatGFloat,
typename longGFloat>
1063 const fatGFloat *fat0, *fat1;
1064 const longGFloat *long0, *long1;
1070 int sharedBytesPerThread()
const
1072 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
1073 return 6 * reg_size;
1078 const longGFloat *long0,
const longGFloat *long1,
1081 :
DslashCuda(out, in, x), fat0(fat0), fat1(fat1), long0(long0), long1(long1),
1082 reconstruct(reconstruct), dagger(dagger), a(a)
1084 bindSpinorTex<sFloat>(
in,
out,
x);
1091 TuneKey key = DslashCuda::tuneKey();
1092 std::stringstream recon;
1093 recon << reconstruct;
1094 key.
aux +=
",reconstruct=" + recon.str();
1095 if (
x) key.
aux +=
",Axpy";
1104 (sFloat*)
out->V(), (
float*)
out->Norm(), fat0, fat1, long0, long1,
1105 (sFloat*)
in->V(), (
float*)
in->Norm(), (sFloat*)(
x ?
x->V() : 0), (
float*)(
x ?
x->Norm() : 0), a);
1110 long long flops()
const {
return (
x ? 1158ll : 1146ll) * dslashConstants.VolumeCB(); }
1113 #ifdef DSLASH_PROFILING
1115 #define TDIFF(a,b) 1e3*(b.tv_sec - a.tv_sec + 1e-6*(b.tv_usec - a.tv_usec))
1117 void dslashTimeProfile() {
1119 cudaEventSynchronize(dslashEnd);
1121 cudaEventElapsedTime(&runTime, dslashStart, dslashEnd);
1122 dslashTime += runTime;
1124 for (
int i=4; i>=0; i--) {
1128 cudaEventElapsedTime(&runTime, dslashStart, kernelStart[2*i]);
1129 kernelTime[2*i][0] += runTime;
1130 cudaEventElapsedTime(&runTime, dslashStart, kernelEnd[2*i]);
1131 kernelTime[2*i][1] += runTime;
1135 for (
int i=3; i>=0; i--) {
1140 cudaEventElapsedTime(&runTime, dslashStart, packStart[2*i+
dir]);
1141 packTime[2*i+
dir][0] += runTime;
1142 cudaEventElapsedTime(&runTime, dslashStart, packEnd[2*i+
dir]);
1143 packTime[2*i+
dir][1] += runTime;
1146 cudaEventElapsedTime(&runTime, dslashStart, gatherStart[2*i+
dir]);
1147 gatherTime[2*i+
dir][0] += runTime;
1148 cudaEventElapsedTime(&runTime, dslashStart, gatherEnd[2*i+
dir]);
1149 gatherTime[2*i+
dir][1] += runTime;
1152 runTime =
TDIFF(dslashStart_h, commsStart[2*i+
dir]);
1153 commsTime[2*i+
dir][0] += runTime;
1154 runTime =
TDIFF(dslashStart_h, commsEnd[2*i+
dir]);
1155 commsTime[2*i+
dir][1] += runTime;
1158 cudaEventElapsedTime(&runTime, dslashStart, scatterStart[2*i+
dir]);
1159 scatterTime[2*i+
dir][0] += runTime;
1160 cudaEventElapsedTime(&runTime, dslashStart, scatterEnd[2*i+
dir]);
1161 scatterTime[2*i+
dir][1] += runTime;
1168 void printDslashProfile() {
1170 printfQuda(
"Total Dslash time = %6.2f\n", dslashTime);
1172 char dimstr[8][8] = {
"X-",
"X+",
"Y-",
"Y+",
"Z-",
"Z+",
"T-",
"T+"};
1174 printfQuda(
" %13s %13s %13s %13s %13s\n",
"Pack",
"Gather",
"Comms",
"Scatter",
"Kernel");
1175 printfQuda(
" %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s\n",
1176 "Start",
"End",
"Start",
"End",
"Start",
"End",
"Start",
"End",
"Start",
"End");
1178 printfQuda(
"%8s %55s %6.2f %6.2f\n",
"Interior",
"", kernelTime[8][0], kernelTime[8][1]);
1180 for (
int i=3; i>=0; i--) {
1187 printfQuda(
"%6.2f %6.2f ", gatherTime[2*i+
dir][0], gatherTime[2*i+
dir][1]);
1189 printfQuda(
"%6.2f %6.2f ", scatterTime[2*i+
dir][0], scatterTime[2*i+
dir][1]);
1192 if (
dir==0)
printfQuda(
"%6.2f %6.2f\n", kernelTime[2*i][0], kernelTime[2*i][1]);
1210 for (
int i=0; i<
Nstream-1; i++) {
1211 gatherCompleted[i] = 0;
1212 commsCompleted[i] = 0;
1213 dslashCompleted[i] = 0;
1215 gatherCompleted[Nstream-1] = 1;
1216 commsCompleted[Nstream-1] = 1;
1222 for (
int i=3; i>=0; i--) {
1224 int prev = Nstream-1;
1226 previousDir[2*i + 1] = prev;
1227 previousDir[2*i + 0] = 2*i + 1;
1239 const int volume,
const int *faceVolumeCB) {
1245 gettimeofday(&dslashStart_h, NULL);
1257 for(
int i = 3; i >=0; i--){
1262 cudaStreamWaitEvent(
streams[2*i+
dir], packEnd[0], 0);
1264 cudaStreamWaitEvent(
streams[2*i+
dir], dslashStart, 0);
1267 face->
gather(*inSpinor, dagger, 2*i+
dir);
1280 int completeSum = 0;
1281 while (completeSum < commDimTotal) {
1282 for (
int i=3; i>=0; i--) {
1288 if (!gatherCompleted[2*i+
dir] && gatherCompleted[previousDir[2*i+
dir]]) {
1289 if (cudaSuccess == cudaEventQuery(gatherEnd[2*i+
dir])) {
1290 gatherCompleted[2*i+
dir] = 1;
1292 gettimeofday(&commsStart[2*i+
dir], NULL);
1298 if (!commsCompleted[2*i+
dir] && commsCompleted[previousDir[2*i+
dir]] &&
1299 gatherCompleted[2*i+
dir]) {
1301 commsCompleted[2*i+
dir] = 1;
1303 gettimeofday(&commsEnd[2*i+
dir], NULL);
1313 if (!dslashCompleted[2*i] && commsCompleted[2*i] && commsCompleted[2*i+1] ) {
1315 cudaEventRecord(scatterEnd[2*i],
streams[2*i]);
1325 dslashCompleted[2*i] = 1;
1343 #ifdef GPU_WILSON_DIRAC
1345 for(
int i=0;i<4;i++){
1352 void *gauge0, *gauge1;
1356 errorQuda(
"Mixing gauge %d and spinor %d precision not supported",
1360 size_t regSize =
sizeof(float);
1362 #if (__COMPUTE_CAPABILITY__ >= 130)
1365 regSize =
sizeof(double);
1367 errorQuda(
"Double precision not supported on this GPU");
1383 errorQuda(
"Wilson dslash has not been built");
1384 #endif // GPU_WILSON_DIRAC
1394 #ifdef GPU_CLOVER_DIRAC
1396 for(
int i=0;i<4;i++){
1403 void *cloverP, *cloverNormP;
1406 void *gauge0, *gauge1;
1410 errorQuda(
"Mixing gauge and spinor precision not supported");
1413 errorQuda(
"Mixing clover and spinor precision not supported");
1416 size_t regSize =
sizeof(float);
1419 #if (__COMPUTE_CAPABILITY__ >= 130)
1423 regSize =
sizeof(double);
1425 errorQuda(
"Double precision not supported on this GPU");
1445 errorQuda(
"Clover dslash has not been built");
1457 #ifdef GPU_CLOVER_DIRAC
1459 for(
int i=0;i<4;i++){
1466 void *cloverP, *cloverNormP;
1469 void *gauge0, *gauge1;
1473 errorQuda(
"Mixing gauge and spinor precision not supported");
1476 errorQuda(
"Mixing clover and spinor precision not supported");
1479 size_t regSize =
sizeof(float);
1482 #if (__COMPUTE_CAPABILITY__ >= 130)
1486 regSize =
sizeof(double);
1488 errorQuda(
"Double precision not supported on this GPU");
1508 errorQuda(
"Clover dslash has not been built");
1516 const double &epsilon,
const int *commOverride)
1519 #ifdef GPU_TWISTED_MASS_DIRAC
1522 int ghost_threads[4] = {0};
1525 for(
int i=0;i<4;i++){
1533 void *gauge0, *gauge1;
1537 errorQuda(
"Mixing gauge and spinor precision not supported");
1540 size_t regSize =
sizeof(float);
1543 #if (__COMPUTE_CAPABILITY__ >= 130)
1544 dslash =
new TwistedDslashCuda<double2,double2>(
out, (double2*)gauge0,(double2*)gauge1, gauge.
Reconstruct(),
in,
x,
kappa,
mu, epsilon,
dagger);
1545 regSize =
sizeof(double);
1547 errorQuda(
"Double precision not supported on this GPU");
1550 dslash =
new TwistedDslashCuda<float4,float4>(
out, (float4*)gauge0,(float4*)gauge1, gauge.
Reconstruct(),
in,
x,
kappa,
mu, epsilon,
dagger);
1553 dslash =
new TwistedDslashCuda<short4,short4>(
out, (short4*)gauge0,(short4*)gauge1, gauge.
Reconstruct(),
in,
x,
kappa,
mu, epsilon,
dagger);
1556 dslashCuda(*dslash, regSize, parity, dagger, bulk_threads, ghost_threads);
1563 errorQuda(
"Twisted mass dslash has not been built");
1575 #ifdef GPU_DOMAIN_WALL_DIRAC
1579 for(
int i = 0;i < dirs; i++){
1586 void *gauge0, *gauge1;
1590 errorQuda(
"Mixing gauge and spinor precision not supported");
1593 size_t regSize =
sizeof(float);
1596 #if (__COMPUTE_CAPABILITY__ >= 130)
1599 regSize =
sizeof(double);
1601 errorQuda(
"Double precision not supported on this GPU");
1614 for (
int i=0; i<4; i++) ghostFace[i] = in->
GhostFace()[i] / in->
X(4);
1622 errorQuda(
"Domain wall dslash has not been built");
1629 const double &k,
const int *commOverride)
1633 #ifdef GPU_STAGGERED_DIRAC
1636 for(
int i=0;i < 4; i++){
1638 errorQuda(
"ERROR: partitioned dimension with local size less than 6 is not supported in staggered dslash\n");
1647 for(
int i=0;i<4;i++){
1653 void *fatGauge0, *fatGauge1;
1654 void* longGauge0, *longGauge1;
1659 errorQuda(
"Mixing gauge and spinor precision not supported"
1660 "(precision=%d, fatlinkGauge.precision=%d, longGauge.precision=%d",
1665 size_t regSize =
sizeof(float);
1668 #if (__COMPUTE_CAPABILITY__ >= 130)
1670 (double2*)longGauge0, (double2*)longGauge1,
1672 regSize =
sizeof(double);
1674 errorQuda(
"Double precision not supported on this GPU");
1678 (float4*)longGauge0, (float4*)longGauge1,
1682 (short4*)longGauge0, (short4*)longGauge1,
1695 errorQuda(
"Staggered dslash has not been built");
1696 #endif // GPU_STAGGERED_DIRAC
1700 template <
typename sFloat,
typename cFloat>
1705 char *saveOut, *saveOutNorm;
1706 const cFloat *clover;
1707 const float *cloverNorm;
1711 int sharedBytesPerThread()
const
1713 int reg_size = (
typeid(sFloat)==
typeid(double2) ?
sizeof(double) :
sizeof(
float));
1722 : out(out), clover(clover), cloverNorm(cloverNorm), in(in)
1724 bindSpinorTex<sFloat>(
in);
1732 ((sFloat*)
out->V(), (
float*)
out->Norm(), clover, cloverNorm,
1737 std::stringstream vol, aux;
1738 vol << dslashConstants.x[0] <<
"x";
1739 vol << dslashConstants.x[1] <<
"x";
1740 vol << dslashConstants.x[2] <<
"x";
1741 vol << dslashConstants.x[3];
1742 return TuneKey(vol.str(),
typeid(*this).name());
1748 saveOut =
new char[
out->Bytes()];
1749 cudaMemcpy(saveOut,
out->V(),
out->Bytes(), cudaMemcpyDeviceToHost);
1750 if (
typeid(sFloat) ==
typeid(short4)) {
1751 saveOutNorm =
new char[
out->NormBytes()];
1752 cudaMemcpy(saveOutNorm,
out->Norm(),
out->NormBytes(), cudaMemcpyDeviceToHost);
1760 cudaMemcpy(
out->V(), saveOut,
out->Bytes(), cudaMemcpyHostToDevice);
1762 if (
typeid(sFloat) ==
typeid(short4)) {
1763 cudaMemcpy(
out->Norm(), saveOutNorm,
out->NormBytes(), cudaMemcpyHostToDevice);
1764 delete[] saveOutNorm;
1771 std::stringstream ps;
1772 ps <<
"block=(" << param.block.x <<
"," << param.block.y <<
"," << param.block.z <<
"), ";
1773 ps <<
"shared=" << param.shared_bytes;
1777 long long flops()
const {
return 504ll * dslashConstants.VolumeCB(); }
1787 #ifdef GPU_CLOVER_DIRAC
1789 void *cloverP, *cloverNormP;
1793 errorQuda(
"Mixing clover and spinor precision not supported");
1796 #if (__COMPUTE_CAPABILITY__ >= 130)
1799 errorQuda(
"Double precision not supported on this GPU");
1813 errorQuda(
"Clover dslash has not been built");
1818 template <
typename sFloat>
1828 int sharedBytesPerThread()
const {
return 0; }
1829 int sharedBytesPerBlock(
const TuneParam ¶m)
const {
return 0; }
1830 bool advanceGridDim(
TuneParam ¶m)
const {
return false; }
1832 char *saveOut, *saveOutNorm;
1839 bindSpinorTex<sFloat>(
in);
1843 a =
kappa, b =
mu, c = epsilon;
1847 unbindSpinorTex<sFloat>(
in);
1851 std::stringstream vol, aux;
1852 vol << dslashConstants.x[0] <<
"x";
1853 vol << dslashConstants.x[1] <<
"x";
1854 vol << dslashConstants.x[2] <<
"x";
1855 vol << dslashConstants.x[3];
1856 return TuneKey(vol.str(),
typeid(*this).name(), aux.str());
1871 ((sFloat*)
out->V(), (
float*)
out->Norm(), a, b, c, (sFloat*)
in->V(), (
float*)
in->Norm(),
dslashParam);
1876 saveOut =
new char[
out->Bytes()];
1877 cudaMemcpy(saveOut,
out->V(),
out->Bytes(), cudaMemcpyDeviceToHost);
1878 if (
typeid(sFloat) ==
typeid(short4)) {
1879 saveOutNorm =
new char[
out->NormBytes()];
1880 cudaMemcpy(saveOutNorm,
out->Norm(),
out->NormBytes(), cudaMemcpyDeviceToHost);
1885 cudaMemcpy(
out->V(), saveOut,
out->Bytes(), cudaMemcpyHostToDevice);
1887 if (
typeid(sFloat) ==
typeid(short4)) {
1888 cudaMemcpy(
out->Norm(), saveOutNorm,
out->NormBytes(), cudaMemcpyHostToDevice);
1889 delete[] saveOutNorm;
1894 std::stringstream ps;
1895 ps <<
"block=(" << param.
block.x <<
"," << param.
block.y <<
"," << param.
block.z <<
"), ";
1900 long long flops()
const {
return 24ll * dslashConstants.VolumeCB(); }
1912 #if (defined GPU_TWISTED_MASS_DIRAC) || (defined GPU_NDEG_TWISTED_MASS_DIRAC)
1916 #if (__COMPUTE_CAPABILITY__ >= 130)
1917 twistGamma5 =
new TwistGamma5Cuda<double2>(
out,
in,
kappa,
mu, epsilon,
dagger, twist);
1919 errorQuda(
"Double precision not supported on this GPU");
1922 twistGamma5 =
new TwistGamma5Cuda<float4>(
out,
in,
kappa,
mu, epsilon,
dagger, twist);
1924 twistGamma5 =
new TwistGamma5Cuda<short4>(
out,
in,
kappa,
mu, epsilon,
dagger, twist);
1932 errorQuda(
"Twisted mass dslash has not been built");
1933 #endif // GPU_TWISTED_MASS_DIRAC
1941 #if defined(GPU_FATLINK) || defined(GPU_GAUGE_FORCE) || defined(GPU_FERMION_FORCE) || defined(GPU_HISQ_FORCE) || defined(GPU_UNITARIZE)
1949 #ifdef GPU_GAUGE_FORCE
1953 #ifdef GPU_FERMION_FORCE
1957 #ifdef GPU_UNITARIZE
1961 #ifdef GPU_HISQ_FORCE