#include <dslash_index.cuh>
#include <dslash_quda.cuh>
#include <dslash_init.cuh>
#include <dslash_events.cuh>

using namespace covdev;
// Dispatch a covariant-derivative kernel by gauge reconstruction (18/12/8 reals)
// and by direction (0-3); the kernel name is assembled via token pasting.
#define MORE_GENERIC_COVDEV(FUNC, dir, DAG, kernel_type, gridDim, blockDim, shared, stream, param, ...) \
  if (reconstruct == QUDA_RECONSTRUCT_NO) {                             \
    switch (dir) {                                                      \
    case 0: FUNC ## 018 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    case 1: FUNC ## 118 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    case 2: FUNC ## 218 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    case 3: FUNC ## 318 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    }                                                                   \
  } else if (reconstruct == QUDA_RECONSTRUCT_12) {                      \
    switch (dir) {                                                      \
    case 0: FUNC ## 012 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    case 1: FUNC ## 112 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    case 2: FUNC ## 212 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    case 3: FUNC ## 312 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    }                                                                   \
  } else if (reconstruct == QUDA_RECONSTRUCT_8) {                       \
    switch (dir) {                                                      \
    case 0: FUNC ## 08 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    case 1: FUNC ## 18 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    case 2: FUNC ## 28 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    case 3: FUNC ## 38 ## DAG ## Kernel<kernel_type><<<gridDim, blockDim, shared, stream>>> ( __VA_ARGS__ , param); break; \
    }                                                                   \
  }
// Dispatch on the kernel type (interior volume, or one of the four exterior faces).
#define GENERIC_COVDEV(FUNC, dir, DAG, gridDim, blockDim, shared, stream, param, ...) \
  switch (param.kernel_type) {                                          \
  case INTERIOR_KERNEL:                                                 \
    MORE_GENERIC_COVDEV(FUNC, dir, DAG, INTERIOR_KERNEL, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                              \
  case EXTERIOR_KERNEL_X:                                               \
    MORE_GENERIC_COVDEV(FUNC, dir, DAG, EXTERIOR_KERNEL_X, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                              \
  case EXTERIOR_KERNEL_Y:                                               \
    MORE_GENERIC_COVDEV(FUNC, dir, DAG, EXTERIOR_KERNEL_Y, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                              \
  case EXTERIOR_KERNEL_Z:                                               \
    MORE_GENERIC_COVDEV(FUNC, dir, DAG, EXTERIOR_KERNEL_Z, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                              \
  case EXTERIOR_KERNEL_T:                                               \
    MORE_GENERIC_COVDEV(FUNC, dir, DAG, EXTERIOR_KERNEL_T, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
    break;                                                              \
  }
#define COVDEV(FUNC, mu, gridDim, blockDim, shared, stream, param, ...) \
  if (mu < 4) {                                                         \
    GENERIC_COVDEV(FUNC, mu, , gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
  } else { const int nMu = mu % 4;                                      \
    GENERIC_COVDEV(FUNC, nMu, Dagger, gridDim, blockDim, shared, stream, param, __VA_ARGS__) \
  }
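// Illustrative expansion (sketch only, not part of the build): for a call such as
//
//   COVDEV(covDevM, 1, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam, ...)
//
// with dslashParam.kernel_type == INTERIOR_KERNEL and reconstruct == QUDA_RECONSTRUCT_12,
// the token pasting above resolves to a single launch of the form
//
//   covDevM112Kernel<INTERIOR_KERNEL><<<tp.grid, tp.block, tp.shared_bytes, stream>>>(..., dslashParam);
//
// i.e. the generated kernel name encodes the direction (1), the gauge reconstruction (12)
// and, for mu >= 4, an extra "Dagger" token before "Kernel".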
#define PROFILE(f, profile, idx)        \
  profile.Start(idx);                   \
  f;                                    \
  profile.Stop(idx);
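// Usage sketch (assumes the QudaProfileType value QUDA_PROFILE_DSLASH_KERNEL from QUDA's
// TimeProfile interface; shown for illustration only):
//
//   PROFILE(dslash.apply(stream), profile, QUDA_PROFILE_DSLASH_KERNEL);
//
// which expands to profile.Start(...); dslash.apply(stream); profile.Stop(...);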
void covDevCuda(DslashCuda &dslash, const size_t regSize, const int mu, TimeProfile &profile)
{
  const int dir = mu % 4;
  // ...
  dslashParam.kernel_type = static_cast<KernelType>(dir);  // dir 0-3 selects EXTERIOR_KERNEL_X..T
  // ...
  dslashParam.ghostDim[dir] = 0;
  dslashParam.commDim[dir] = 0;
  // ...
}
template <typename Float, typename Float2>
class CovDevCuda : public SharedDslashCuda
{
  const cudaGaugeField *gauge;
  // ...
  void *gauge0, *gauge1;
  // ...
#ifdef USE_TEXTURE_OBJECTS
  cudaTextureObject_t tex;
#endif
  unsigned int minThreads() const
  {
    if (dslashParam.kernel_type == INTERIOR_KERNEL) { return in->Volume(); }
    else { return ghostVolume; }
  }

  unsigned int sharedBytesPerThread() const { return 0; }
  dim3 createGrid(const dim3 &block) const
  {
    unsigned int gx = (dslashParam.threads + block.x - 1) / block.x;  // ceiling division over the thread count
    unsigned int gy = 1, gz = 1;
    return dim3(gx, gy, gz);
  }
  bool advanceBlockDim(TuneParam &param) const
  {
    const unsigned int min_threads = 2;
    const unsigned int max_threads = 512;
    // ...
    param.grid = createGrid(param.block);
    if ((param.block.x > min_threads) && (param.block.x < max_threads))
      return true;
    else
      return false;
  }
  void allocateGhosts()
  {
    if (cudaMalloc(&ghostBuffer, ghostBytes) != cudaSuccess) {
      printf("Error in rank %d: Unable to allocate %d bytes for GPU ghosts\n", comm_rank(), ghostBytes);
      // ...
    }
  }
  void exchangeGhosts()
  {
    const int rel = (mu < 4) ? 1 : -1;  // +1 for mu < 4, -1 for the daggered directions
    // ...
    if (cudaHostAlloc(&send, ghostBytes, 0) != cudaSuccess) {
      printf("Error in rank %d: Unable to allocate %d bytes for MPI requests (send)\n", comm_rank(), ghostBytes);
      // ...
    }

    if (cudaHostAlloc(&recv, ghostBytes, 0) != cudaSuccess) {
      printf("Error in rank %d: Unable to allocate %d bytes for MPI requests (recv)\n", comm_rank(), ghostBytes);
      // ...
    }
    { // dir == 0: x face, starting at the origin of the field; one Nvec-wide chunk per face site
      void *sendFacePtr = (char*)in->V();
      size_t len    = Nvec*sizeof(Float);
      size_t skip   = len*in->X(0);
      size_t dpitch = ghostVolume*Nvec*sizeof(Float);
      size_t spitch = in->Stride()*Nvec*sizeof(Float);

      for (int t=0; t<ghostVolume; t++) {
        cudaMemcpy2DAsync((void*)(((char*)send)+len*t), dpitch, (void*)(((char*)sendFacePtr)+skip*t),
                          spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
        cudaStreamSynchronize(streams[0]);
      }
    }
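    // Note on the 2D copies used throughout this routine (sketch of the cudaMemcpy2DAsync
    // semantics): each call copies Npad rows of `len` bytes; consecutive source rows sit
    // `spitch` bytes apart (the padded device stride of the spinor field) and consecutive
    // destination rows sit `dpitch` bytes apart (the packed stride of the host ghost buffer).
    // One call per face site therefore pulls that site's Npad chunks out of the strided field
    // and packs them contiguously into `send`. The cudaStreamSynchronize(streams[0]) after
    // every call serializes the copies.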
    { // dir == 1: y face, starting at the origin of the field; one x-row per (z,t) slice
      void *sendFacePtr = (char*)in->V();
      size_t len    = in->X(0)*Nvec*sizeof(Float);
      size_t skip   = len*in->X(1);
      size_t dpitch = ghostVolume*Nvec*sizeof(Float);
      size_t spitch = in->Stride()*Nvec*sizeof(Float);

      for (int tz=0; tz<(in->X(2)*in->X(3)); tz++) {
        cudaMemcpy2DAsync((void*)(((char*)send)+len*tz), dpitch, (void*)(((char*)sendFacePtr)+skip*tz),
                          spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
        cudaStreamSynchronize(streams[0]);
      }
    }
    { // dir == 2: z face, starting at the origin of the field; one (x,y) plane per t slice
      void *sendFacePtr = (char*)in->V();
      size_t len    = ghostVolume*Nvec*sizeof(Float)/in->X(3);
      size_t skip   = len*in->X(2);
      size_t dpitch = ghostVolume*Nvec*sizeof(Float);
      size_t spitch = in->Stride()*Nvec*sizeof(Float);

      for (int t=0; t<in->X(3); t++) {
        cudaMemcpy2DAsync((void*)(((char*)send)+len*t), dpitch, (void*)(((char*)sendFacePtr)+skip*t),
                          spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
        cudaStreamSynchronize(streams[0]);
      }
    }
    { // dir == 3: t face, starting at the origin of the field; the face is contiguous, so one copy suffices
      void *sendFacePtr = (char*)in->V();
      size_t len    = ghostVolume*Nvec*sizeof(Float);
      size_t spitch = in->Stride()*Nvec*sizeof(Float);

      cudaMemcpy2DAsync(send, len, sendFacePtr, spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
      cudaStreamSynchronize(streams[0]);
    }
    { // dir == 0: x face on the opposite boundary, starting `offset` elements into the field
      void *sendFacePtr = (char*)in->V() + offset*Nvec*sizeof(Float);
      size_t len    = Nvec*sizeof(Float);
      size_t skip   = len*in->X(0);
      size_t dpitch = ghostVolume*Nvec*sizeof(Float);
      size_t spitch = in->Stride()*Nvec*sizeof(Float);

      for (int t=0; t<ghostVolume; t++) {
        cudaMemcpy2DAsync((void*)(((char*)send)+len*t), dpitch, (void*)(((char*)sendFacePtr)+skip*t),
                          spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
        cudaStreamSynchronize(streams[0]);
      }
    }
    { // dir == 1: y face on the opposite boundary, starting `offset` elements into the field
      void *sendFacePtr = (char*)in->V() + offset*Nvec*sizeof(Float);
      size_t len    = in->X(0)*Nvec*sizeof(Float);
      size_t skip   = len*in->X(1);
      size_t dpitch = ghostVolume*Nvec*sizeof(Float);
      size_t spitch = in->Stride()*Nvec*sizeof(Float);

      for (int tz=0; tz<(in->X(2)*in->X(3)); tz++) {
        cudaMemcpy2DAsync((void*)(((char*)send)+len*tz), dpitch, (void*)(((char*)sendFacePtr)+skip*tz),
                          spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
        cudaStreamSynchronize(streams[0]);
      }
    }
    { // dir == 2: z face on the opposite boundary, starting `offset` elements into the field
      void *sendFacePtr = (char*)in->V() + offset*Nvec*sizeof(Float);
      size_t len    = ghostVolume*Nvec*sizeof(Float)/in->X(3);
      size_t skip   = len*in->X(2);
      size_t dpitch = ghostVolume*Nvec*sizeof(Float);
      size_t spitch = in->Stride()*Nvec*sizeof(Float);

      for (int t=0; t<in->X(3); t++) {
        cudaMemcpy2DAsync((void*)(((char*)send)+len*t), dpitch, (void*)(((char*)sendFacePtr)+skip*t),
                          spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
        cudaStreamSynchronize(streams[0]);
      }
    }
    { // dir == 3: t face on the opposite boundary, starting `offset` elements into the field
      void *sendFacePtr = (char*)in->V() + offset*Nvec*sizeof(Float);
      size_t len    = ghostVolume*Nvec*sizeof(Float);
      size_t spitch = in->Stride()*Nvec*sizeof(Float);

      cudaMemcpy2DAsync(send, len, sendFacePtr, spitch, len, Npad, cudaMemcpyDeviceToHost, streams[0]);
      cudaStreamSynchronize(streams[0]);
    }
    // Copy the received face into the device-side ghost buffer
    cudaMemcpy(ghostBuffer, recv, ghostBytes, cudaMemcpyHostToDevice);
    cudaDeviceSynchronize();
    // ...
  }

  void freeGhosts() { cudaFree(ghostBuffer); }
  void bindGhosts()
  {
    if (binded == false) {
#ifdef USE_TEXTURE_OBJECTS
      cudaChannelFormatDesc desc;
      memset(&desc, 0, sizeof(cudaChannelFormatDesc));
      if (in->Precision() == QUDA_SINGLE_PRECISION) desc.f = cudaChannelFormatKindFloat;
      else desc.f = cudaChannelFormatKindSigned;   // double precision is fetched via integer texture reads

      desc.x = 8*in->Precision();   // bits per texture channel
      desc.y = 8*in->Precision();
      // ...

      cudaResourceDesc resDesc;
      memset(&resDesc, 0, sizeof(resDesc));
      resDesc.resType                = cudaResourceTypeLinear;
      resDesc.res.linear.devPtr      = ghostBuffer;
      resDesc.res.linear.desc        = desc;
      resDesc.res.linear.sizeInBytes = Nint * ghostVolume * sizeof(Float);

      cudaTextureDesc texDesc;
      memset(&texDesc, 0, sizeof(texDesc));
      texDesc.readMode = cudaReadModeElementType;

      cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);

      dslashParam.inTex = tex;
      // ...
      errorQuda("Half precision for covariant derivative not supported.");
      // ...
    }
  }
  void unbindGhosts() {
#ifdef USE_TEXTURE_OBJECTS
    cudaDestroyTextureObject(tex);
#endif
    // ...
  }
 public:
  CovDevCuda(cudaColorSpinorField *out, const cudaGaugeField *gauge, const cudaColorSpinorField *in,
             const int parity, const int mu)
    : SharedDslashCuda(out, in, 0, gauge->Reconstruct(), mu < 4 ? 0 : 1),
      gauge(gauge), parity(parity), mu(mu), dir(mu % 4), binded(false)
  {
    bindSpinorTex<Float2>(in, out);
    // ...
    Nvec = sizeof(Float2)/sizeof(Float);   // number of Float components per Float2/Float4 load
    Nint = in->Ncolor()*in->Nspin()*Nvec;
    // Face volume and the element offset of the uppermost slice for each direction
    switch (dir) {
    case 0:  // x face
      ghostVolume = in->X(1)*in->X(2)*in->X(3)/2;
      offset = in->X(0) - 1;
      break;
    case 1:  // y face
      ghostVolume = in->X(0)*in->X(2)*in->X(3);
      offset = in->X(0)*(in->X(1) - 1);
      break;
    case 2:  // z face
      ghostVolume = in->X(0)*in->X(1)*in->X(3);
      offset = in->X(0)*in->X(1)*(in->X(2) - 1);
      break;
    case 3:  // t face
      ghostVolume = in->X(0)*in->X(1)*in->X(2);
      offset = in->Volume() - ghostVolume;
      break;
    }

    ghostBytes = ghostVolume*Nint*sizeof(Float);
  virtual ~CovDevCuda()
  {
    // ...
  }

  void apply(const cudaStream_t &stream)
  {
#ifdef SHARED_WILSON_DSLASH
    // ...
    errorQuda("Shared dslash (covariant derivative) does not yet support X-dimension partitioning");
#endif
    // ...
    errorQuda("Covariant derivative does not yet support X or Y-dimension partitioning");

    dslashParam.parity = parity;

    for (int i=0; i<4; i++) {
      dslashParam.ghostDim[i] = 0;   // disable the generic ghost-zone machinery
      dslashParam.ghostOffset[i] = 0;
      dslashParam.ghostNormOffset[i] = 0;
      dslashParam.commDim[i] = 0;
    }

    if (dslashParam.kernel_type != INTERIOR_KERNEL) {
      // ...
      dslashParam.threads = ghostVolume;
      // ...
      COVDEV(covDevM, mu, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
             (Float2*)out->V(), (Float2*)gauge0, (Float2*)gauge1, (Float2*)in->V());
    } else {
      dslashParam.threads = in->Volume();
      // ...
      COVDEV(covDevM, mu, tp.grid, tp.block, tp.shared_bytes, stream, dslashParam,
             (Float2*)out->V(), (Float2*)gauge0, (Float2*)gauge1, (Float2*)in->V());
    }
  }

  long long flops() const { return 144 * in->Volume(); }
  // ...
};
void covDev(cudaColorSpinorField *out, cudaGaugeField &gauge, const cudaColorSpinorField *in,
            const int parity, const int mu, TimeProfile &profile)
{
  DslashCuda *covdev = 0;
  size_t regSize = sizeof(float);

  if (in->Precision() == QUDA_HALF_PRECISION)
    errorQuda("Error: Half precision not supported");

  if (in->Precision() != gauge.Precision())
    errorQuda("Mixing gauge %d and spinor %d precision not supported", gauge.Precision(), in->Precision());

  initConstants(gauge, profile);

  if (in->Precision() == QUDA_SINGLE_PRECISION) {
    covdev = new CovDevCuda<float, float4>(out, &gauge, in, parity, mu);
  } else if (in->Precision() == QUDA_DOUBLE_PRECISION) {
#if (__COMPUTE_CAPABILITY__ >= 130)
    covdev = new CovDevCuda<double, double2>(out, &gauge, in, parity, mu);
    regSize = sizeof(double);
#else
    errorQuda("Error: Double precision not supported by hardware");
#endif
  }

  covDevCuda(*covdev, regSize, mu, profile);
  // ...
#else
  errorQuda("Contraction kernels have not been built");
#endif
}