QUDA: quda/include/face_quda.h Source File

QUDA v0.4.0
A library for QCD on GPUs
00001 #ifndef _FACE_QUDA_H
00002 #define _FACE_QUDA_H
00003 
00004 #include <quda_internal.h>
00005 #include <color_spinor_field.h>
00006 
00007 #ifndef MPI_COMMS
00008 
00009 class FaceBuffer {
00010 
      // Variant of FaceBuffer that is declared when MPI_COMMS is NOT defined.
      // It holds per-dimension (QUDA_MAX_DIM-sized) arrays of face buffers and,
      // when QMP_COMMS is defined, the matching QMP message memory and message
      // handles. Only declarations are visible in this header; the member
      // descriptions below come from the existing comments and the names.
      // NOTE(review): a copy constructor and a destructor are declared, but no
      // copy-assignment operator is. Since the class stores raw pointers,
      // confirm that assignment is never used (or declare it explicitly).
00011  private:  
00012   // Stream indices. Set these both = 0 for no overlap of QMP and cudaMemcpyAsync;
00013   // sendBackIdx = 0, and sendFwdIdx = 1 for overlap.
        // NOTE(review): "sendBackIdx"/"sendFwdIdx" presumably mean
        // sendBackStrmIdx/sendFwdStrmIdx below -- confirm.
00014   int sendBackStrmIdx; // = 0;
00015   int sendFwdStrmIdx; // = 1;
00016   int recFwdStrmIdx; // = sendBackIdx;
00017   int recBackStrmIdx; // = sendFwdIdx;
00018 
00019   // Device memory buffer for coalescing the gathered messages
00020   void *gather_fwd_face;
00021   void *gather_back_face;
00022 
00023   // CUDA pinned memory (one pointer per dimension)
00024   void *my_fwd_face[QUDA_MAX_DIM];
00025   void *my_back_face[QUDA_MAX_DIM];
00026   void *from_back_face[QUDA_MAX_DIM];
00027   void *from_fwd_face[QUDA_MAX_DIM];
00028 
00029   // IB pinned memory (one pointer per dimension)
00030   void* ib_my_fwd_face[QUDA_MAX_DIM];
00031   void* ib_my_back_face[QUDA_MAX_DIM];  
00032   void* ib_from_back_face[QUDA_MAX_DIM];
00033   void* ib_from_fwd_face[QUDA_MAX_DIM];
00034 
00035   int Ninternal; // number of internal degrees of freedom (12 for spin projected Wilson, 6 for staggered)
00036   QudaPrecision precision;
00037 
        // Lattice geometry. NOTE(review): presumably filled in by setupDims(),
        // with the "CB" members being the checkerboarded (half-volume)
        // counterparts -- confirm against the implementation.
00038   int Volume;
00039   int VolumeCB;
00040   int faceVolume[QUDA_MAX_DIM];
00041   int faceVolumeCB[QUDA_MAX_DIM];
00042   int X[QUDA_MAX_DIM];
00043   int nDim;
00044   int nFace;
00045 
        // Per-dimension byte count. NOTE(review): presumably the size of one
        // face message in that dimension -- confirm.
00046   size_t nbytes[QUDA_MAX_DIM];
00047 #ifdef QMP_COMMS
        // QMP message memory descriptors, one per dimension and direction
00048   QMP_msgmem_t mm_send_fwd[QUDA_MAX_DIM];
00049   QMP_msgmem_t mm_from_fwd[QUDA_MAX_DIM];
00050   QMP_msgmem_t mm_send_back[QUDA_MAX_DIM];
00051   QMP_msgmem_t mm_from_back[QUDA_MAX_DIM];
00052   
        // QMP message handles, one per dimension and direction
00053   QMP_msghandle_t mh_send_fwd[QUDA_MAX_DIM];
00054   QMP_msghandle_t mh_from_fwd[QUDA_MAX_DIM];
00055   QMP_msghandle_t mh_send_back[QUDA_MAX_DIM];
00056   QMP_msghandle_t mh_from_back[QUDA_MAX_DIM];
00057 #endif
00058 
        // Private helper taking the lattice dimensions X (defined elsewhere).
00059   void setupDims(const int *X);
00060  public:
        // Construct from lattice dimensions X (nDim entries, presumably),
        // the number of internal degrees of freedom, the number of faces and
        // the precision.
00061   FaceBuffer(const int *X, const int nDim, const int Ninternal,
00062              const int nFace, const QudaPrecision precision);
00063   FaceBuffer(const FaceBuffer &);
00064   virtual ~FaceBuffer();
00065 
        // Communication steps on a device spinor field. Only the signatures are
        // visible here. NOTE(review): the names suggest the sequence
        // pack -> gather -> commsStart -> commsQuery -> scatter; confirm the
        // intended call order and the meaning of `dir` against the implementation.
00066   void pack(cudaColorSpinorField &in, int parity, int dagger, int dim, cudaStream_t *stream);
00067   void gather(cudaColorSpinorField &in, int dagger, int dir);
00068   void commsStart(int dir);
00069   int  commsQuery(int dir);
00070   void scatter(cudaColorSpinorField &out, int dagger, int dir);
00071 
        // Exchange on a host (CPU) spinor field.
00072   void exchangeCpuSpinor(cpuColorSpinorField &in, int parity, int dagger);
00073 
        // Exchange of host link data. NOTE(review): presumably link_sendbuf is
        // the outgoing data and ghost_link receives the result -- confirm.
00074   void exchangeCpuLink(void** ghost_link, void** link_sendbuf);
00075 };
00076 
// Declaration only (non-MPI build); the definition is not in this header.
// NOTE(review): presumably transfers gauge-field face data between `gauge`
// and `gauge_face`, with V and Vs being the full and face volumes -- confirm
// direction and units against the implementation.
00077 void transferGaugeFaces(void *gauge, void *gauge_face, QudaPrecision precision,
00078                         int veclength, QudaReconstructType reconstruct, int V, int Vs);
00079 
00080 #else // MPI comms
00081 
// Direction indices, defined only in the MPI_COMMS branch of this header.
// The "UP" values run 0..3 (X, Y, Z, T) and each "DOWN" value equals
// 7 minus its "UP" counterpart (e.g. XUP = 0 <-> XDOWN = 7).
// NOTE(review): their users are not visible in this header.
00082 #define XUP 0
00083 #define YUP 1
00084 #define ZUP 2
00085 #define TUP 3
00086 #define TDOWN 4
00087 #define ZDOWN 5
00088 #define YDOWN 6
00089 #define XDOWN 7
00090 
00091 
00092 class FaceBuffer {
00093 
      // Variant of FaceBuffer that is declared when MPI_COMMS IS defined.
      // It exposes the same public interface as the non-MPI variant but stores
      // per-dimension (QUDA_MAX_DIM-sized) send/receive buffers and opaque
      // request pointers instead of QMP objects. Only declarations are visible
      // in this header.
      // NOTE(review): a copy constructor and a destructor are declared, but no
      // copy-assignment operator is. Since the class stores raw pointers,
      // confirm that assignment is never used (or declare it explicitly).
00094  private:
00095   // Stream indices. Set these both = 0 for no overlap of qmp and cudaMemcpyAsync;
00096   // sendBackIdx = 0, and sendFwdIdx = 1 for overlap.
        // NOTE(review): this comment mentions qmp although this is the MPI
        // variant; it looks copied from the non-MPI class -- confirm it applies.
00097   int sendBackStrmIdx; // = 0;
00098   int sendFwdStrmIdx; // = 1;
00099   int recFwdStrmIdx; // = sendBackIdx;
00100   int recBackStrmIdx; // = sendFwdIdx;
00101 
00102   int Ninternal; // number of internal degrees of freedom (12 for spin projected Wilson, 6 for staggered)
00103   QudaPrecision precision;
        // Per-dimension byte count. NOTE(review): presumably the size of one
        // face message in that dimension -- confirm.
00104   size_t nbytes[QUDA_MAX_DIM];
00105 
        // Lattice geometry. NOTE(review): presumably filled in by setupDims(),
        // with the "CB" members being the checkerboarded (half-volume)
        // counterparts -- confirm against the implementation.
00106   int Volume;
00107   int VolumeCB;
00108   int faceVolume[QUDA_MAX_DIM];
00109   int faceVolumeCB[QUDA_MAX_DIM];
00110   int X[QUDA_MAX_DIM];
00111   int nDim;
00112   int nFace;
00113 
        // Per-dimension send buffers towards the forward / backward neighbour.
00114   void* fwd_nbr_spinor_sendbuf[QUDA_MAX_DIM];
00115   void* back_nbr_spinor_sendbuf[QUDA_MAX_DIM];
00116   
        // Per-dimension buffers for data from the forward / backward neighbour.
00117   void* fwd_nbr_spinor[QUDA_MAX_DIM];
00118   void* back_nbr_spinor[QUDA_MAX_DIM];
00119 
        // "pageable_" counterparts of the four buffer arrays above.
        // NOTE(review): presumably ordinary (non-pinned) host memory used for
        // the MPI calls -- confirm against the allocation code.
00120   void* pageable_fwd_nbr_spinor_sendbuf[QUDA_MAX_DIM];
00121   void* pageable_back_nbr_spinor_sendbuf[QUDA_MAX_DIM];
00122   
00123   void* pageable_fwd_nbr_spinor[QUDA_MAX_DIM];
00124   void* pageable_back_nbr_spinor[QUDA_MAX_DIM];
00125   
        // Opaque per-dimension request pointers for receives and sends.
        // NOTE(review): presumably handles to outstanding MPI requests; the
        // meaning of the 1/2 suffix is not visible here -- confirm.
00126   void* recv_request1[QUDA_MAX_DIM], *recv_request2[QUDA_MAX_DIM];
00127   void* send_request1[QUDA_MAX_DIM], *send_request2[QUDA_MAX_DIM];
00128   
        // Private helper taking the lattice dimensions X (defined elsewhere).
00129   void setupDims(const int *X);
00130   
00131  public:
        // Construct from lattice dimensions X (nDim entries, presumably),
        // the number of internal degrees of freedom, the number of faces and
        // the precision.
00132   FaceBuffer(const int *X, const int nDim, const int Ninternal,
00133              const int nFace, const QudaPrecision precision);
00134   FaceBuffer(const FaceBuffer &);
00135   virtual ~FaceBuffer();
00136 
        // Communication steps on a device spinor field. Only the signatures are
        // visible here. NOTE(review): the names suggest the sequence
        // pack -> gather -> commsStart -> commsQuery -> scatter; confirm the
        // intended call order and the meaning of `dir` against the implementation.
00137   void pack(cudaColorSpinorField &in, int parity, int dagger, int dim, cudaStream_t *stream);
00138   void gather(cudaColorSpinorField &in, int dagger, int dir);
00139   void commsStart(int dir);
00140   int  commsQuery(int dir);
00141   void scatter(cudaColorSpinorField &out, int dagger, int dir);
00142 
        // Exchange on a host (CPU) spinor field.
00143   void exchangeCpuSpinor(cpuColorSpinorField &in, int parity, int dagger);
00144 
        // Exchange of host link data. NOTE(review): presumably link_sendbuf is
        // the outgoing data and ghost_link receives the result -- confirm.
00145   void exchangeCpuLink(void** ghost_link, void** link_sendbuf);
00146 
00147 };
00148 
// C-linkage prototypes for the gauge-link / staple exchange routines that are
// declared only in the MPI_COMMS branch. Definitions are not in this header.
// NOTE(review): the per-function remarks below are inferred from names and
// parameter lists only -- confirm against the implementation.
00149 #ifdef __cplusplus
00150 extern "C" {
00151 #endif
        // Host site-link exchange; X holds the lattice dimensions.
00152   void exchange_cpu_sitelink(int* X,void** sitelink, void** ghost_sitelink,
00153                              void** ghost_sitelink_diag, 
00154                              QudaPrecision gPrecision, QudaGaugeParam* param, int optflag); 
        // "_ex" variant operating on sitelink directly, with an explicit field order.
00155   void exchange_cpu_sitelink_ex(int* X, void** sitelink, QudaGaugeFieldOrder cpu_order,
00156                                 QudaPrecision gPrecision, int optflag);
        // Device staple exchange split into start / comms / wait phases for one
        // direction (`dir`) and orientation (`whichway`) on the given stream.
00157   void exchange_gpu_staple_start(int* X, void* _cudaStaple, int dir, int whichway,  cudaStream_t * stream);
00158   void exchange_gpu_staple_comms(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream);
00159   void exchange_gpu_staple_wait(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream);
        // Single-call device staple exchange (declared once; an identical
        // second declaration was redundant).
00160   void exchange_gpu_staple(int* X, void* _cudaStaple, cudaStream_t * stream);
00161   // Host staple exchange writing into ghost_staple.
00162   void exchange_cpu_staple(int* X, void* staple, void** ghost_staple,
00163                            QudaPrecision gPrecision);
        // Set-up / tear-down pair for the llfat exchange at the given precision.
00164   void exchange_llfat_init(QudaPrecision prec);
00165   void exchange_llfat_cleanup(void);
00166 
00167 #ifdef __cplusplus
00168 }
00169 #endif
00170 
00171 #endif
00172 
00173  // end of the MPI_COMMS conditional (#ifndef MPI_COMMS ... #else ... #endif)
00174 
// C-linkage declarations shared by both the MPI and non-MPI builds: a global
// flag, reduction helpers and communication-topology queries. Definitions are
// not in this header.
// NOTE(review): reduceMaxDouble/reduceDouble take `double &`, which is not
// valid C, so the __cplusplus guards do not make this block usable from C --
// confirm whether C compatibility is intended.
00175 #ifdef __cplusplus
00176 extern "C" {
00177 #endif
        // NOTE(review): presumably enables/disables the global reductions below -- confirm.
00178   extern bool globalReduce;
00179 
        // Reductions that update their argument in place (by reference, or
        // through a pointer to `len` doubles for the array form).
        // NOTE(review): presumably max / sum / element-wise sum across all
        // processes -- confirm.
00180   void reduceMaxDouble(double &);
00181   void reduceDouble(double &);
00182   void reduceDoubleArray(double *, const int len);
00183 
        // Communication-grid queries taking a dimension/direction index.
        // NOTE(review): presumably grid extent, this process's coordinate, and
        // whether a dimension is partitioned (query / set) -- confirm.
00184   int commDim(int);
00185   int commCoords(int);
00186   int commDimPartitioned(int dir);
00187   void commDimPartitionedSet(int dir);
00188   void commBarrier();
00189 
00190 #ifdef __cplusplus
00191 }
00192 #endif
00193 
00194 #endif // _FACE_QUDA_H