v0.9.0/doc/contract_8cu_source.html

 namespace quda {

   namespace dslash_aux {
     #include <dslash_constants.h>
     #include <dslash_textures.h>
     #include <io_spinor.h>
   }

   using namespace dslash_aux;

 #ifdef GPU_CONTRACT
 #ifdef READ_SPINOR_SINGLE
 #undef READ_SPINOR_SINGLE
 #endif

 #include "contract_core.h"
 #include "contract_core_plus.h"
 #include "contract_core_minus.h"

 #ifndef _TWIST_QUDA_CONTRACT
 #error  "Contraction core undefined"
 #endif

 #ifndef _TWIST_QUDA_CONTRACT_PLUS
 #error  "Contraction core (plus) undefined"
 #endif

 #ifndef _TWIST_QUDA_CONTRACT_MINUS
 #error  "Contraction core (minus) undefined"
 #endif

 // Raise errorQuda if the two spinor fields a and b differ in precision,
 // length or stride. The body is wrapped in do { } while (0) so that the
 // macro expands to exactly one statement and can be used safely in an
 // unbraced if/else; the arguments are parenthesized so that arbitrary
 // expressions can be passed.
 // NOTE(review): all three values are printed with %d - confirm that
 // Length() and Stride() return an int-compatible type.
 #define checkSpinor(a, b)           \
   do {                 \
     if  ((a).Precision() != (b).Precision())        \
       errorQuda("precisions do not match: %d %d", (a).Precision(), (b).Precision()); \
     if  ((a).Length() != (b).Length())          \
       errorQuda("lengths do not match: %d %d", (a).Length(), (b).Length()); \
     if  ((a).Stride() != (b).Stride())          \
       errorQuda("strides do not match: %d %d", (a).Stride(), (b).Stride()); \
   } while (0)

   /**
      @brief Tunable wrapper that launches the spinor contraction kernels.

      Depending on the contraction type, apply() launches one of the
      volume kernels (plain or gamma5, each in a store / add / subtract
      flavour) or one of the time-slice kernels, writing into result.

      @tparam Float2 Vector type the spinor data pointers are cast to
      @tparam rFloat Vector type the result pointer is cast to
   */
   template <typename Float2, typename rFloat>
   class ContractCuda : public Tunable {

   private:
     DslashParam dslashParam;        // Parameter struct passed by value to every kernel launch
     // NOTE(review): x and y are held by value, so constructing this object
     // copy-constructs both fields - confirm that this copy is intended.
     const cudaColorSpinorField x;   // Spinor to be contracted
     const cudaColorSpinorField y;   // Spinor to be contracted
     const QudaParity parity;    // Parity of the field, actual kernels act on parity spinors
     const QudaContractType contract_type; // Type of contraction, selects the kernel in apply()

     void *result;       // The output array with the result of the contraction

     const int nTSlice;      // Time-slice in case of time-dilution (-1 for volume contractions)

     char aux[16][TuneKey::aux_n];     // For tuning purposes, one string per contraction type

     char *saveOut;        // Host backup of result, alive between preTune() and postTune()

     unsigned int sharedBytesPerThread() const { return 16*sizeof(rFloat); }
     unsigned int sharedBytesPerBlock(const TuneParam &param) const { return 0; }
     bool tuneGridDim() const { return false; } // Don't tune the grid dimensions.
     unsigned int minThreads() const { return x.X(0) * x.X(1) * x.X(2) * x.X(3); }

     // Store the tuning string for one contraction type.
     void fillAux(QudaContractType contract_type, const char *contract_str) { strcpy(aux[contract_type], contract_str); }

     // Populate the tuning strings of every contraction type (shared by both constructors).
     void fillAllAux() {
       fillAux(QUDA_CONTRACT, "type=plain");
       fillAux(QUDA_CONTRACT_PLUS, "type=plain-plus");
       fillAux(QUDA_CONTRACT_MINUS, "type=plain-minus");
       fillAux(QUDA_CONTRACT_GAMMA5, "type=gamma5");
       fillAux(QUDA_CONTRACT_GAMMA5_PLUS, "type=gamma5-plus");
       fillAux(QUDA_CONTRACT_GAMMA5_MINUS, "type=gamma5-minus");
       fillAux(QUDA_CONTRACT_TSLICE, "type=tslice");
       fillAux(QUDA_CONTRACT_TSLICE_PLUS, "type=tslice-plus");
       fillAux(QUDA_CONTRACT_TSLICE_MINUS, "type=tslice-minus");
     }

     // Number of bytes of result that are saved and restored around tuning.
     // NOTE(review): this uses 32*sizeof(Float2) per thread whereas the
     // shared-memory hint uses 16*sizeof(rFloat); confirm that the result
     // allocation is at least this large.
     size_t backupBytes() const { return dslashParam.threads*sizeof(Float2)*32; }

   public:
     /**
        @brief Constructor for volume contractions (no time-slice).
        @param x First spinor to be contracted
        @param y Second spinor to be contracted
        @param result Output array (cast to rFloat* at launch time)
        @param parity Parity forwarded to the kernels
        @param contract_type Contraction type, selects the kernel in apply()
     */
     ContractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaParity parity, const QudaContractType contract_type) :
       x(x), y(y), parity(parity), contract_type(contract_type), result(result), nTSlice(-1), saveOut(0) {
       fillAllAux();

       dslashParam.threads = x.Volume();
       dslashParam.dc = y.getDslashConstant();
       bindSpinorTex<Float2>(&x, &y);
     }

     /**
        @brief Constructor for time-slice contractions.
        @param x First spinor to be contracted
        @param y Second spinor to be contracted
        @param result Output array (cast to rFloat* at launch time)
        @param parity Parity forwarded to the kernels
        @param contract_type Contraction type, selects the kernel in apply()
        @param tSlice Time-slice forwarded to the time-slice kernels
     */
     ContractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaParity parity, const QudaContractType contract_type, const int tSlice) :
       x(x), y(y), parity(parity), contract_type(contract_type), result(result), nTSlice(tSlice), saveOut(0) {
       fillAllAux();

       // Fill in the *member* dslashParam: a local variable of the same
       // name used to shadow it here, leaving the struct that is handed
       // to the kernels (and used by preTune/postTune) uninitialized.
       dslashParam.threads = x.X(0)*x.X(1)*x.X(2);
       dslashParam.Vsh = (x.X(0)*x.X(1)*x.X(2)) / x.SiteSubset();
       dslashParam.dc = y.getDslashConstant();

       // Bind the textures as the volume constructor does; the destructor
       // unbinds them unconditionally.
       bindSpinorTex<Float2>(&x, &y);
     }

     virtual ~ContractCuda() { unbindSpinorTex<Float2>(&x, &y); }

     QudaContractType ContractType() const { return contract_type; }

     // Tuning key: volume string, dynamic type name and the aux string of the contraction type.
     TuneKey tuneKey() const
     {
       return TuneKey(x.VolString(), typeid(*this).name(), aux[contract_type]);
     }

     /**
        @brief Launch the kernel selected by contract_type with the tuned launch parameters.
        @param stream Stream the kernel is launched in; callers
        synchronize this stream after apply(), so the launch must not
        go to the default stream.
     */
     void apply(const cudaStream_t &stream)
     {
       TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity());
       switch (contract_type)
         {
         default:  // Unknown types fall through to the gamma5 volume contraction
         case QUDA_CONTRACT_GAMMA5:    // Calculates the volume contraction (x^+ g5)_\mu y_\nu and stores it in result
           contractGamma5Kernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
           break;

         case QUDA_CONTRACT_GAMMA5_PLUS: // Calculates the volume contraction (x^+ g5)_\mu y_\nu and adds it to result
           contractGamma5PlusKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
           break;

         case QUDA_CONTRACT_GAMMA5_MINUS:  // Calculates the volume contraction (x^+ g5)_\mu y_\nu and subtracts it from result
           contractGamma5MinusKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
           break;

         case QUDA_CONTRACT:     // Calculates the volume contraction x^+_\mu y_\nu and stores it in result
           contractKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
           break;

         case QUDA_CONTRACT_PLUS:    // Calculates the volume contraction x^+_\mu y_\nu and adds it to result
           contractPlusKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
           break;

         case QUDA_CONTRACT_MINUS:   // Calculates the volume contraction x^+_\mu y_\nu and subtracts it from result
           contractMinusKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), parity, dslashParam);
           break;

         case QUDA_CONTRACT_TSLICE:    // Calculates the time-slice contraction x^+_\mu y_\nu and stores it in result
           contractTsliceKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), nTSlice, parity, dslashParam);
           break;

         case QUDA_CONTRACT_TSLICE_PLUS: // Calculates the time-slice contraction x^+_\mu y_\nu and adds it to result
           contractTslicePlusKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), nTSlice, parity, dslashParam);
           break;

         case QUDA_CONTRACT_TSLICE_MINUS:  // Calculates the time-slice contraction x^+_\mu y_\nu and subtracts it from result
           contractTsliceMinusKernel<<<tp.grid, tp.block, tp.shared_bytes, stream>>>((rFloat*)result, (Float2*)x.V(), (Float2*)y.V(), x.Stride(), nTSlice, parity, dslashParam);
           break;
         }
     }

     // Save a host copy of the result buffer before tuning; postTune() restores it.
     void preTune()      {
       saveOut = new char[backupBytes()];
       cudaMemcpy(saveOut, result, backupBytes(), cudaMemcpyDeviceToHost);
     }

     // Restore the result buffer from the copy taken in preTune() and release that copy.
     void postTune()     {
       cudaMemcpy(result, saveOut, backupBytes(), cudaMemcpyHostToDevice);
       delete[] saveOut;
       saveOut = 0;
     }

     // Performance-model estimates reported to the tuner.
     long long flops() const { return 120ll * x.VolumeCB(); }
     long long bytes() const { return x.Bytes() + x.NormBytes() + y.Bytes() + y.NormBytes(); }
   };
 #endif

   /**
      @brief Volume contraction of two spinor fields (no time-slice argument).

      Builds a ContractCuda object matching the precision of x, applies
      it on streams[Nstream-1], synchronizes that stream and checks for
      CUDA errors. The whole call is timed through profile.

      @param x First spinor to be contracted (its precision selects the kernel instantiation)
      @param y Second spinor to be contracted
      @param result Output array handed to the kernels
      @param contract_type One of the volume contraction types; the
      time-slice types are rejected with an error
      @param parity Parity forwarded to the kernels
      @param profile Profiler that records the init / compute / epilogue phases
      NOTE(review): only x.Precision() is inspected; y is assumed to have
      the same precision - confirm against callers.
   */
   void contractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaContractType contract_type, const QudaParity parity, TimeProfile &profile)
   {
 #ifdef GPU_CONTRACT
     if  ((contract_type == QUDA_CONTRACT_TSLICE) || (contract_type == QUDA_CONTRACT_TSLICE_PLUS) || (contract_type == QUDA_CONTRACT_TSLICE_MINUS)) {
       errorQuda("No time-slice specified for contraction\n");
       return;
     }

     // Reject unsupported precisions before any timer is started and
     // before anything is allocated, so that the pointer dereferenced
     // below can never be null.
     if (x.Precision() == QUDA_HALF_PRECISION) {
       errorQuda("Half precision not supported for gamma5 kernel yet");
       return;
     }
     if ((x.Precision() != QUDA_DOUBLE_PRECISION) && (x.Precision() != QUDA_SINGLE_PRECISION)) {
       errorQuda("Unsupported precision %d for contraction", x.Precision());
       return;
     }

     profile.TPSTART(QUDA_PROFILE_TOTAL);
     profile.TPSTART(QUDA_PROFILE_INIT);

     Tunable *contract = 0;

     if (x.Precision() == QUDA_DOUBLE_PRECISION) {
       contract = new ContractCuda<double2,double2>(x, y, result, parity, contract_type);
     } else { // single precision, guaranteed by the checks above
       contract = new ContractCuda<float4,float2>(x, y, result, parity, contract_type);
     }
     profile.TPSTOP(QUDA_PROFILE_INIT);

     profile.TPSTART(QUDA_PROFILE_COMPUTE);
     contract->apply(streams[Nstream-1]);
     qudaStreamSynchronize(streams[Nstream-1]);
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);

     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
     checkCudaError();

     delete contract;

     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTOP(QUDA_PROFILE_TOTAL);
 #else
     errorQuda("Contraction code has not been built");
 #endif
   }

   /**
      @brief Time-slice contraction of two spinor fields.

      Builds a ContractCuda object matching the precision of x, applies
      it on streams[Nstream-1], synchronizes that stream and checks for
      CUDA errors. The whole call is timed through profile.

      @param x First spinor to be contracted (its precision selects the kernel instantiation)
      @param y Second spinor to be contracted
      @param result Output array handed to the kernels
      @param contract_type One of the time-slice contraction types; any
      other type is rejected with an error
      @param nTSlice Time-slice forwarded to the kernels
      @param parity Parity forwarded to the kernels
      @param profile Profiler that records the init / compute / epilogue phases
      NOTE(review): only x.Precision() is inspected; y is assumed to have
      the same precision - confirm against callers.
   */
   void contractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaContractType contract_type,
         const int nTSlice, const QudaParity parity, TimeProfile &profile)
   {
 #ifdef GPU_CONTRACT
     // The type must be one of the three time-slice variants. The tests
     // have to be joined with &&: joined with || the condition holds for
     // every value of contract_type and each call would be rejected.
     if  ((contract_type != QUDA_CONTRACT_TSLICE) && (contract_type != QUDA_CONTRACT_TSLICE_PLUS) && (contract_type != QUDA_CONTRACT_TSLICE_MINUS)) {
       errorQuda("No time-slice input allowed for volume contractions\n");
       return;
     }

     // Reject unsupported precisions before any timer is started and
     // before anything is allocated, so that the pointer dereferenced
     // below can never be null.
     if (x.Precision() == QUDA_HALF_PRECISION) {
       errorQuda("Half precision not supported for gamma5 kernel yet");
       return;
     }
     if ((x.Precision() != QUDA_DOUBLE_PRECISION) && (x.Precision() != QUDA_SINGLE_PRECISION)) {
       errorQuda("Unsupported precision %d for contraction", x.Precision());
       return;
     }

     profile.TPSTART(QUDA_PROFILE_TOTAL);
     profile.TPSTART(QUDA_PROFILE_INIT);

     Tunable *contract = 0;

     if (x.Precision() == QUDA_DOUBLE_PRECISION) {
       contract = new ContractCuda<double2,double2>(x, y, result, parity, contract_type, nTSlice);
     } else { // single precision, guaranteed by the checks above
       contract = new ContractCuda<float4,float2>(x, y, result, parity, contract_type, nTSlice);
     }
     profile.TPSTOP(QUDA_PROFILE_INIT);

     profile.TPSTART(QUDA_PROFILE_COMPUTE);
     contract->apply(streams[Nstream-1]);
     qudaStreamSynchronize(streams[Nstream-1]);
     profile.TPSTOP(QUDA_PROFILE_COMPUTE);

     profile.TPSTART(QUDA_PROFILE_EPILOGUE);
     checkCudaError();
     delete contract;

     profile.TPSTOP(QUDA_PROFILE_EPILOGUE);
     profile.TPSTOP(QUDA_PROFILE_TOTAL);
 #else
     errorQuda("Contraction code has not been built");
 #endif
   }

 } // namespace quda

contract
void contract(const cudaColorSpinorField x, const cudaColorSpinorField y, void *ctrn, const QudaContractType cType)
Definition: interface_quda.cpp:5369

contractKernel
__global__ void contractKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
Definition: contract_core.h:594

QUDA_CONTRACT_TSLICE_PLUS
Definition: enum_quda.h:456

contractTslicePlusKernel
__global__ void contractTslicePlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
Definition: contract_core_plus.h:353

getVerbosity
QudaVerbosity getVerbosity()
Definition: util_quda.cpp:20

errorQuda
#define errorQuda(...)
Definition: util_quda.h:90

contract_core.h

QUDA_HALF_PRECISION
Definition: enum_quda.h:59

streams
cudaStream_t * streams
Definition: interface_quda.cpp:153

quda::stream
cudaStream_t * stream
Definition: cuda_color_spinor_field.cu:898

QUDA_CONTRACT_GAMMA5_PLUS
Definition: enum_quda.h:453

dslash_constants.h

quda::Nstream
const int Nstream
Definition: quda_internal.h:330

strcpy
char * strcpy(char *__dst, const char *__src)

dslash_textures.h

QUDA_CONTRACT_GAMMA5
Definition: enum_quda.h:452

contractGamma5MinusKernel
__global__ void contractGamma5MinusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
Definition: contract_core_minus.h:58

quda::cudaColorSpinorField
Definition: color_spinor_field.h:504

QUDA_CONTRACT_TSLICE
Definition: enum_quda.h:455

contract_core_minus.h

DslashParam::Vsh
int Vsh
Definition: dslash_constants.h:131

quda
Definition: blas_cublas.h:6

param
QudaGaugeParam param
Definition: pack_test.cpp:17

x
p x
Definition: CMakeCUDACompilerId.cpp1.ii:3011

contract_core_plus.h

quda::Tunable
Definition: tune_quda.h:60

quda::qudaStreamSynchronize
cudaError_t qudaStreamSynchronize(cudaStream_t &stream)
Wrapper around cudaStreamSynchronize or cuStreamSynchronize.
Definition: quda_cuda_api.cpp:243

DslashParam::dc
DslashConstant dc
Definition: dslash_constants.h:23

quda::QUDA_PROFILE_EPILOGUE
Definition: quda_internal.h:174

QUDA_CONTRACT
Definition: enum_quda.h:449

QUDA_CONTRACT_TSLICE_MINUS
Definition: enum_quda.h:457

quda::QUDA_PROFILE_COMPUTE
Definition: quda_internal.h:172

QUDA_CONTRACT_MINUS
Definition: enum_quda.h:451

quda::tuneLaunch
TuneParam & tuneLaunch(Tunable &tunable, QudaTune enabled, QudaVerbosity verbosity)
Definition: tune.cpp:603

contractMinusKernel
__global__ void contractMinusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
Definition: contract_core_minus.h:621

io_spinor.h

contractTsliceKernel
__global__ void contractTsliceKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
Definition: contract_core.h:343

QudaParity
enum QudaParity_s QudaParity

contractGamma5PlusKernel
__global__ void contractGamma5PlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
Definition: contract_core_plus.h:58

quda::QUDA_PROFILE_INIT
Definition: quda_internal.h:170

QUDA_CONTRACT_GAMMA5_MINUS
Definition: enum_quda.h:454

QUDA_DOUBLE_PRECISION
Definition: enum_quda.h:61

QUDA_SINGLE_PRECISION
Definition: enum_quda.h:60

quda::TuneKey::aux_n
static const int aux_n
Definition: tune_key.h:12

quda::QUDA_PROFILE_TOTAL
Definition: quda_internal.h:205

quda::contractCuda
void contractCuda(const cudaColorSpinorField &x, const cudaColorSpinorField &y, void *result, const QudaContractType contract_type, const QudaParity parity, TimeProfile &profile)
Definition: contract.cu:202

quda::blas::flops
unsigned long long flops
Definition: blas_quda.cu:42

contractPlusKernel
__global__ void contractPlusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
Definition: contract_core_plus.h:621

quda::TimeProfile
Definition: quda_internal.h:232

QudaContractType
enum QudaContractType_s QudaContractType

checkCudaError
#define checkCudaError()
Definition: util_quda.h:129

contractGamma5Kernel
__global__ void contractGamma5Kernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Parity, const DslashParam param)
Definition: contract_core.h:65

getTuning
QudaTune getTuning()
Query whether autotuning is enabled or not. Default is enabled but can be overridden by setting QUDA_...
Definition: util_quda.cpp:51

contractTsliceMinusKernel
__global__ void contractTsliceMinusKernel(double2 *out, double2 *in1, double2 *in2, int myStride, const int Tslice, const int Parity, const DslashParam param)
Definition: contract_core_minus.h:353

DslashParam::threads
int threads
Definition: dslash_constants.h:16

DslashParam
Definition: dslash_constants.h:15

parity
QudaParity parity
Definition: covdev_test.cpp:53

QUDA_CONTRACT_PLUS
Definition: enum_quda.h:450

quda::blas::bytes
unsigned long long bytes
Definition: blas_quda.cu:43

y
int y
Definition: CMakeCUDACompilerId.cpp1.ii:2637