QUDA v0.4.0
A library for QCD on GPUs
|
00001 #ifndef _FACE_QUDA_H 00002 #define _FACE_QUDA_H 00003 00004 #include <quda_internal.h> 00005 #include <color_spinor_field.h> 00006 00007 #ifndef MPI_COMMS 00008 00009 class FaceBuffer { 00010 00011 private: 00012 // set these both = 0 `for no overlap of qmp and cudamemcpyasync 00013 // sendBackIdx = 0, and sendFwdIdx = 1 for overlap 00014 int sendBackStrmIdx; // = 0; 00015 int sendFwdStrmIdx; // = 1; 00016 int recFwdStrmIdx; // = sendBackIdx; 00017 int recBackStrmIdx; // = sendFwdIdx; 00018 00019 // Device memory buffer for coalescing the gathered messages 00020 void *gather_fwd_face; 00021 void *gather_back_face; 00022 00023 // CUDA pinned memory 00024 void *my_fwd_face[QUDA_MAX_DIM]; 00025 void *my_back_face[QUDA_MAX_DIM]; 00026 void *from_back_face[QUDA_MAX_DIM]; 00027 void *from_fwd_face[QUDA_MAX_DIM]; 00028 00029 // IB pinned memory 00030 void* ib_my_fwd_face[QUDA_MAX_DIM]; 00031 void* ib_my_back_face[QUDA_MAX_DIM]; 00032 void* ib_from_back_face[QUDA_MAX_DIM]; 00033 void* ib_from_fwd_face[QUDA_MAX_DIM]; 00034 00035 int Ninternal; // number of internal degrees of freedom (12 for spin projected Wilson, 6 for staggered) 00036 QudaPrecision precision; 00037 00038 int Volume; 00039 int VolumeCB; 00040 int faceVolume[QUDA_MAX_DIM]; 00041 int faceVolumeCB[QUDA_MAX_DIM]; 00042 int X[QUDA_MAX_DIM]; 00043 int nDim; 00044 int nFace; 00045 00046 size_t nbytes[QUDA_MAX_DIM]; 00047 #ifdef QMP_COMMS 00048 QMP_msgmem_t mm_send_fwd[QUDA_MAX_DIM]; 00049 QMP_msgmem_t mm_from_fwd[QUDA_MAX_DIM]; 00050 QMP_msgmem_t mm_send_back[QUDA_MAX_DIM]; 00051 QMP_msgmem_t mm_from_back[QUDA_MAX_DIM]; 00052 00053 QMP_msghandle_t mh_send_fwd[QUDA_MAX_DIM]; 00054 QMP_msghandle_t mh_from_fwd[QUDA_MAX_DIM]; 00055 QMP_msghandle_t mh_send_back[QUDA_MAX_DIM]; 00056 QMP_msghandle_t mh_from_back[QUDA_MAX_DIM]; 00057 #endif 00058 00059 void setupDims(const int *X); 00060 public: 00061 FaceBuffer(const int *X, const int nDim, const int Ninternal, 00062 const int nFace, const QudaPrecision precision); 00063 FaceBuffer(const FaceBuffer &); 00064 virtual ~FaceBuffer(); 00065 00066 void pack(cudaColorSpinorField &in, int parity, int dagger, int dim, cudaStream_t *stream); 00067 void gather(cudaColorSpinorField &in, int dagger, int dir); 00068 void commsStart(int dir); 00069 int commsQuery(int dir); 00070 void scatter(cudaColorSpinorField &out, int dagger, int dir); 00071 00072 void exchangeCpuSpinor(cpuColorSpinorField &in, int parity, int dagger); 00073 00074 void exchangeCpuLink(void** ghost_link, void** link_sendbuf); 00075 }; 00076 00077 void transferGaugeFaces(void *gauge, void *gauge_face, QudaPrecision precision, 00078 int veclength, QudaReconstructType reconstruct, int V, int Vs); 00079 00080 #else // MPI comms 00081 00082 #define XUP 0 00083 #define YUP 1 00084 #define ZUP 2 00085 #define TUP 3 00086 #define TDOWN 4 00087 #define ZDOWN 5 00088 #define YDOWN 6 00089 #define XDOWN 7 00090 00091 00092 class FaceBuffer { 00093 00094 private: 00095 // set these both = 0 `for no overlap of qmp and cudamemcpyasync 00096 // sendBackIdx = 0, and sendFwdIdx = 1 for overlap 00097 int sendBackStrmIdx; // = 0; 00098 int sendFwdStrmIdx; // = 1; 00099 int recFwdStrmIdx; // = sendBackIdx; 00100 int recBackStrmIdx; // = sendFwdIdx; 00101 00102 int Ninternal; // number of internal degrees of freedom (12 for spin projected Wilson, 6 for staggered) 00103 QudaPrecision precision; 00104 size_t nbytes[QUDA_MAX_DIM]; 00105 00106 int Volume; 00107 int VolumeCB; 00108 int faceVolume[QUDA_MAX_DIM]; 00109 int faceVolumeCB[QUDA_MAX_DIM]; 00110 int X[QUDA_MAX_DIM]; 00111 int nDim; 00112 int nFace; 00113 00114 void* fwd_nbr_spinor_sendbuf[QUDA_MAX_DIM]; 00115 void* back_nbr_spinor_sendbuf[QUDA_MAX_DIM]; 00116 00117 void* fwd_nbr_spinor[QUDA_MAX_DIM]; 00118 void* back_nbr_spinor[QUDA_MAX_DIM]; 00119 00120 void* pageable_fwd_nbr_spinor_sendbuf[QUDA_MAX_DIM]; 00121 void* pageable_back_nbr_spinor_sendbuf[QUDA_MAX_DIM]; 00122 00123 void* pageable_fwd_nbr_spinor[QUDA_MAX_DIM]; 00124 void* pageable_back_nbr_spinor[QUDA_MAX_DIM]; 00125 00126 void* recv_request1[QUDA_MAX_DIM], *recv_request2[QUDA_MAX_DIM]; 00127 void* send_request1[QUDA_MAX_DIM], *send_request2[QUDA_MAX_DIM]; 00128 00129 void setupDims(const int *X); 00130 00131 public: 00132 FaceBuffer(const int *X, const int nDim, const int Ninternal, 00133 const int nFace, const QudaPrecision precision); 00134 FaceBuffer(const FaceBuffer &); 00135 virtual ~FaceBuffer(); 00136 00137 void pack(cudaColorSpinorField &in, int parity, int dagger, int dim, cudaStream_t *stream); 00138 void gather(cudaColorSpinorField &in, int dagger, int dir); 00139 void commsStart(int dir); 00140 int commsQuery(int dir); 00141 void scatter(cudaColorSpinorField &out, int dagger, int dir); 00142 00143 void exchangeCpuSpinor(cpuColorSpinorField &in, int parity, int dagger); 00144 00145 void exchangeCpuLink(void** ghost_link, void** link_sendbuf); 00146 00147 }; 00148 00149 #ifdef __cplusplus 00150 extern "C" { 00151 #endif 00152 void exchange_cpu_sitelink(int* X,void** sitelink, void** ghost_sitelink, 00153 void** ghost_sitelink_diag, 00154 QudaPrecision gPrecision, QudaGaugeParam* param, int optflag); 00155 void exchange_cpu_sitelink_ex(int* X, void** sitelink, QudaGaugeFieldOrder cpu_order, 00156 QudaPrecision gPrecision, int optflag); 00157 void exchange_gpu_staple_start(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream); 00158 void exchange_gpu_staple_comms(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream); 00159 void exchange_gpu_staple_wait(int* X, void* _cudaStaple, int dir, int whichway, cudaStream_t * stream); 00160 void exchange_gpu_staple(int* X, void* _cudaStaple, cudaStream_t * stream); 00161 void exchange_gpu_staple(int* X, void* _cudaStaple, cudaStream_t * stream); 00162 void exchange_cpu_staple(int* X, void* staple, void** ghost_staple, 00163 QudaPrecision gPrecision); 00164 void exchange_llfat_init(QudaPrecision prec); 00165 void exchange_llfat_cleanup(void); 00166 00167 #ifdef __cplusplus 00168 } 00169 #endif 00170 00171 #endif 00172 00173 // MPI_COMMS 00174 00175 #ifdef __cplusplus 00176 extern "C" { 00177 #endif 00178 extern bool globalReduce; 00179 00180 void reduceMaxDouble(double &); 00181 void reduceDouble(double &); 00182 void reduceDoubleArray(double *, const int len); 00183 00184 int commDim(int); 00185 int commCoords(int); 00186 int commDimPartitioned(int dir); 00187 void commDimPartitionedSet(int dir); 00188 void commBarrier(); 00189 00190 #ifdef __cplusplus 00191 } 00192 #endif 00193 00194 #endif // _FACE_QUDA_H