/**
 * Elapsed time in seconds between two timestamps, computed as b - a.
 *
 * @param a  start timestamp; any expression with tv_sec / tv_usec members
 *           (e.g. a struct timeval filled in by gettimeofday)
 * @param b  end timestamp, same requirements as a
 * @return   (b - a) in seconds as a double; tv_usec contributes the
 *           fractional part (1 usec = 0.000001 s)
 *
 * Both arguments are parenthesized so that expressions such as *ptr or
 * arr[i] expand correctly. Each argument is evaluated twice, so do not pass
 * expressions with side effects.
 */
#define TDIFF(a, b) (((b).tv_sec - (a).tv_sec) + 0.000001 * ((b).tv_usec - (a).tv_usec))
74 int linksize =
prec*recon;
77 int matrix_mul_flops = 198;
78 int matrix_add_flops = 18;
80 int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
81 int middle_link_data_io[6][2] = {
89 int middle_link_data_flops[6][2] = {
98 int num_calls_side_link[2]= {192, 48};
99 int side_link_data_io[2][2] = {
103 int side_link_data_flops[2][2] = {
108 int num_calls_all_link[2] ={192, 192};
109 int all_link_data_io[2][2] = {
113 int all_link_data_flops[2][2] = {
119 for(
int i = 0;i < 6; i++){
120 total_io += num_calls_middle_link[i]
121 *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize);
124 for(
int i = 0;i < 2; i++){
125 total_io += num_calls_side_link[i]
126 *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize);
128 for(
int i = 0;i < 2; i++){
129 total_io += num_calls_all_link[i]
130 *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize);
135 double total_flops = 0;
136 for(
int i = 0;i < 6; i++){
137 total_flops += num_calls_middle_link[i]
138 *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops);
141 for(
int i = 0;i < 2; i++){
142 total_flops += num_calls_side_link[i]
143 *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops);
145 for(
int i = 0;i < 2; i++){
146 total_flops += num_calls_all_link[i]
147 *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops);
152 *
flops = total_flops;
154 printfQuda(
"flop/byte =%.1f\n", total_flops/total_io);
159 static int R[4] = {2, 2, 2, 2};
161 static int R[4] = {0, 0, 0, 0};
166 static void hisq_force_init()
170 qudaGaugeParam.
X[0] =
xdim;
171 qudaGaugeParam.
X[1] =
ydim;
172 qudaGaugeParam.
X[2] =
zdim;
173 qudaGaugeParam.
X[3] =
tdim;
182 memcpy(&qudaGaugeParam_ex, &qudaGaugeParam,
sizeof(
QudaGaugeParam));
244 fprintf(stderr,
"ERROR: malloc failed for hw\n");
276 static void hisq_force_end()
301 static int hisq_force_test(
void)
308 float act_path_coeff[6];
310 act_path_coeff[0] = 0.625000;
311 act_path_coeff[1] = -0.058479;
312 act_path_coeff[2] = -0.087719;
313 act_path_coeff[3] = 0.030778;
314 act_path_coeff[4] = -0.007200;
315 act_path_coeff[5] = -0.123113;
318 double d_act_path_coeff[6];
319 for(
int i=0; i<6; ++i){
320 d_act_path_coeff[i] = act_path_coeff[i];
333 struct timeval ht0, ht1;
334 gettimeofday(&ht0, NULL);
340 gettimeofday(&ht1, NULL);
342 struct timeval t0, t1, t2, t3;
344 gettimeofday(&t0, NULL);
348 gettimeofday(&t1, NULL);
359 gettimeofday(&t2, NULL);
375 gettimeofday(&t3, NULL);
379 int accuracy_level = 3;
386 printfQuda(
"Test %s\n",(1 == res) ?
"PASSED" :
"FAILED");
392 float perf_flops = total_flops / (
TDIFF(t0, t1)) *1e-9;
393 float perf = total_io / (
TDIFF(t0, t1)) *1e-9;
394 printfQuda(
"Staples time: %.2f ms, perf = %.2f GFLOPS, achieved bandwidth= %.2f GB/s\n",
TDIFF(t0,t1)*1000, perf_flops, perf);
395 printfQuda(
"Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n",
TDIFF(t0,t1)*1000,
TDIFF(t1,t2)*1000,
TDIFF(t2,t3)*1000);
396 printfQuda(
"Host time (half-wilson fermion force) : %g ms\n",
TDIFF(ht0, ht1)*1000);
400 return accuracy_level;
406 printfQuda(
"running the following fermion force computation test:\n");
408 printfQuda(
"link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order\n");
418 int main(
int argc,
char **argv)
428 app->add_option(
"--gauge-order",
gauge_order,
"")->transform(CLI::QUDACheckedTransformer(gauge_order_map));
431 app->parse(argc, argv);
432 }
catch (
const CLI::ParseError &e) {
437 errorQuda(
"Multi-gpu for milc order is not supported\n");
446 int accuracy_level = hisq_force_test();
450 if(accuracy_level >=3 ){
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This routine will populate the border / halo region of a gauge field that has been created using...
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This routine will populate the border / halo region of a gauge field that has been created using...
int comm_dim_partitioned(int dim)
std::shared_ptr< QUDAApp > make_app(std::string app_description, std::string app_name)
QudaReconstructType link_recon
std::array< int, 4 > gridsize_from_cmdline
enum QudaPrecision_s QudaPrecision
@ QUDA_CPU_FIELD_LOCATION
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
@ QUDA_GHOST_EXCHANGE_EXTENDED
enum QudaReconstructType_s QudaReconstructType
@ QUDA_FLOAT2_GAUGE_ORDER
void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, int gauge_order)
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
void hisqCompleteForceCPU(const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom)
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
cudaGaugeField * cudaLongLinkOprod
QudaGaugeFieldOrder gauge_order
cpuGaugeField * cpuLongLinkOprod_ex
void total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double *io, double *flops)
int main(int argc, char **argv)
cudaGaugeField * cudaLongLinkOprod_ex
cpuGaugeField * cpuForce_ex
cudaGaugeField * cudaOprod
cpuGaugeField * cpuOprod_ex
cudaGaugeField * cudaOprod_ex
GaugeFieldParam gParam_ex
cudaGaugeField * cudaGauge_ex
cpuGaugeField * cpuLongLinkOprod
QudaPrecision cpu_hw_prec
cudaGaugeField * cudaForce
cudaGaugeField * cudaGauge
cpuGaugeField * cpuGauge_ex
cudaGaugeField * cudaForce_ex
void setPrecision(QudaPrecision precision)
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
void initComms(int argc, char **argv, std::array< int, 4 > &commDims)
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
void createHwCPU(void *hw, QudaPrecision precision)
const char * get_prec_str(QudaPrecision prec)
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
const char * get_recon_str(QudaReconstructType recon)
void hisqCompleteForce(GaugeField &oprod, const GaugeField &link)
Multiply the computed force matrix by the gauge field and perform traceless anti-hermitian projec...
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff)
Compute the long-link contribution to the fermion force.
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6])
Compute the fat-link contribution to the fermion force.
void updateMomentum(GaugeField &mom, double coeff, GaugeField &force, const char *fname)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
Main header file for the QUDA library.
void initQuda(int device)
#define qudaDeviceSynchronize()
QudaReconstructType reconstruct
QudaReconstructType reconstruct
QudaGaugeFieldOrder order
void setPrecision(QudaPrecision precision, bool force_native=false)
Helper function for setting the precision and corresponding field order for QUDA internal fields.
QudaGhostExchange ghostExchange
void setVerbosity(QudaVerbosity verbosity)