15 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec)) 19 extern void usage(
char** argv);
80 int linksize =
prec*recon;
83 int matrix_mul_flops = 198;
84 int matrix_add_flops = 18;
86 int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
87 int middle_link_data_io[6][2] = {
95 int middle_link_data_flops[6][2] = {
104 int num_calls_side_link[2]= {192, 48};
105 int side_link_data_io[2][2] = {
109 int side_link_data_flops[2][2] = {
114 int num_calls_all_link[2] ={192, 192};
115 int all_link_data_io[2][2] = {
119 int all_link_data_flops[2][2] = {
125 for(
int i = 0;
i < 6;
i++){
126 total_io += num_calls_middle_link[
i]
127 *(middle_link_data_io[
i][0]*linksize + middle_link_data_io[
i][1]*cmsize);
130 for(
int i = 0;
i < 2;
i++){
131 total_io += num_calls_side_link[
i]
132 *(side_link_data_io[
i][0]*linksize + side_link_data_io[
i][1]*cmsize);
134 for(
int i = 0;
i < 2;
i++){
135 total_io += num_calls_all_link[
i]
136 *(all_link_data_io[
i][0]*linksize + all_link_data_io[
i][1]*cmsize);
141 double total_flops = 0;
142 for(
int i = 0;
i < 6;
i++){
143 total_flops += num_calls_middle_link[
i]
144 *(middle_link_data_flops[
i][0]*matrix_mul_flops + middle_link_data_flops[
i][1]*matrix_add_flops);
147 for(
int i = 0;
i < 2;
i++){
148 total_flops += num_calls_side_link[
i]
149 *(side_link_data_flops[
i][0]*matrix_mul_flops + side_link_data_flops[
i][1]*matrix_add_flops);
151 for(
int i = 0;
i < 2;
i++){
152 total_flops += num_calls_all_link[
i]
153 *(all_link_data_flops[
i][0]*matrix_mul_flops + all_link_data_flops[
i][1]*matrix_add_flops);
158 *
flops = total_flops;
160 printfQuda(
"flop/byte =%.1f\n", total_flops/total_io);
165 static int R[4] = {2, 2, 2, 2};
167 static int R[4] = {0, 0, 0, 0};
250 fprintf(stderr,
"ERROR: malloc failed for hw\n");
314 float act_path_coeff[6];
316 act_path_coeff[0] = 0.625000;
317 act_path_coeff[1] = -0.058479;
318 act_path_coeff[2] = -0.087719;
319 act_path_coeff[3] = 0.030778;
320 act_path_coeff[4] = -0.007200;
321 act_path_coeff[5] = -0.123113;
324 double d_act_path_coeff[6];
325 for(
int i=0;
i<6; ++
i){
326 d_act_path_coeff[
i] = act_path_coeff[
i];
340 gettimeofday(&ht0, NULL);
346 gettimeofday(&ht1, NULL);
350 gettimeofday(&t0, NULL);
353 cudaDeviceSynchronize();
354 gettimeofday(&t1, NULL);
363 cudaDeviceSynchronize();
365 gettimeofday(&t2, NULL);
379 cudaDeviceSynchronize();
380 gettimeofday(&t3, NULL);
386 int accuracy_level = 3;
392 printfQuda(
"Test %s\n",(1 == res) ?
"PASSED" :
"FAILED");
398 float perf_flops = total_flops / (
TDIFF(t0, t1)) *1
e-9;
399 float perf = total_io / (
TDIFF(t0, t1)) *1
e-9;
400 printfQuda(
"Staples time: %.2f ms, perf = %.2f GFLOPS, achieved bandwidth= %.2f GB/s\n",
TDIFF(t0,t1)*1000, perf_flops, perf);
401 printfQuda(
"Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n",
TDIFF(t0,t1)*1000,
TDIFF(t1,t2)*1000,
TDIFF(t2,t3)*1000);
402 printfQuda(
"Host time (half-wilson fermion force) : %g ms\n",
TDIFF(ht0, ht1)*1000);
406 return accuracy_level;
412 printfQuda(
"running the following fermion force computation test:\n");
414 printfQuda(
"link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order\n");
424 int main(
int argc,
char **argv)
427 for (
i =1;
i < argc;
i++){
433 if(
strcmp(argv[
i],
"--gauge-order") == 0){
438 if(
strcmp(argv[
i+1],
"milc") == 0){
440 }
else if(
strcmp(argv[
i+1],
"qdp") == 0){
443 fprintf(stderr,
"Error: unsupported gauge-field order\n");
450 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[
i]);
455 errorQuda(
"Multi-gpu for milc order is not supported\n");
468 if(accuracy_level >=3 ){
int main(int argc, char **argv)
QudaGhostExchange ghostExchange
cpuGaugeField * cpuForce_ex
enum QudaPrecision_s QudaPrecision
void saveCPUField(cpuGaugeField &cpu) const
Upload from this field into a CPU field.
cudaGaugeField * cudaOprod_ex
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This routine will populate the border / halo region of a gauge field that has been created using...
cudaGaugeField * cudaForce_ex
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
void createHwCPU(void *hw, QudaPrecision precision)
int process_command_line_option(int argc, char **argv, int *idx)
int gridsize_from_cmdline[]
static void setPrecision(QudaPrecision precision)
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
const char * get_prec_str(QudaPrecision prec)
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
QudaPrecision cpu_hw_prec
void exit(int) __attribute__((noreturn))
static void hisq_force_init()
else return(__swbuf(_c, _p))
static void display_test_info()
void hisqCompleteForceCPU(const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom)
int strcmp(const char *__s1, const char *__s2)
void loadCPUField(const cpuGaugeField &cpu)
Download into this field from a CPU field.
void initQuda(int device)
void hisqLongLinkForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, double coeff, long long *flops=nullptr)
Compute the long-link contribution to the fermion force.
void * malloc(size_t __size) __attribute__((__warn_unused_result__)) __attribute__((alloc_size(1)))
void hisqStaplesForce(GaugeField &newOprod, const GaugeField &oprod, const GaugeField &link, const double path_coeff[6], long long *flops=nullptr)
Compute the fat-link contribution to the fermion force.
cudaGaugeField * cudaLongLinkOprod
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
This routine will populate the border / halo region of a gauge field that has been created using...
QudaReconstructType link_recon
const char * get_recon_str(QudaReconstructType recon)
cpuGaugeField * cpuOprod_ex
static QudaGaugeParam qudaGaugeParam
QudaGaugeFieldOrder order
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
static void computeLinkOrderedOuterProduct(half_wilson_vector *src, su3_matrix *dest, int gauge_order)
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam &param, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
QudaReconstructType reconstruct
int fprintf(FILE *, const char *,...) __attribute__((__format__(__printf__
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
void * memcpy(void *__dst, const void *__src, size_t __n)
static QudaGaugeParam qudaGaugeParam_ex
GaugeFieldParam gParam_ex
cpuGaugeField * cpuLongLinkOprod_ex
cudaGaugeField * cudaGauge_ex
void hisqCompleteForce(GaugeField &momentum, const GaugeField &oprod, const GaugeField &link, long long *flops=nullptr)
Multiply the computed force matrix by the gauge field and perform traceless anti-hermitian projec...
cpuGaugeField * cpuLongLinkOprod
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
static int hisq_force_test(void)
static void hisq_force_end()
cpuGaugeField * cpuGauge_ex
cudaGaugeField * cudaGauge
QudaGaugeFieldOrder gauge_order
QudaReconstructType reconstruct
cudaGaugeField * cudaForce
cudaGaugeField * cudaOprod
static __inline__ size_t size_t d
cudaGaugeField * cudaLongLinkOprod_ex
void initComms(int argc, char **argv, const int *commDims)
void copyExtendedGauge(GaugeField &out, const GaugeField &in, QudaFieldLocation location, void *Out=0, void *In=0)
void setVerbosity(const QudaVerbosity verbosity)
int comm_dim_partitioned(int dim)
void total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double *io, double *flops)