18 #define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec))
23 extern void usage(
char** argv);
93 int linksize = prec*recon;
96 int matrix_mul_flops = 198;
97 int matrix_add_flops = 18;
99 int num_calls_middle_link[6] = {24, 24, 96, 96, 24, 24};
100 int middle_link_data_io[6][2] = {
108 int middle_link_data_flops[6][2] = {
118 int num_calls_side_link[2]= {192, 48};
119 int side_link_data_io[2][2] = {
123 int side_link_data_flops[2][2] = {
130 int num_calls_all_link[2] ={192, 192};
131 int all_link_data_io[2][2] = {
135 int all_link_data_flops[2][2] = {
142 for(
int i = 0;i < 6; i++){
143 total_io += num_calls_middle_link[i]
144 *(middle_link_data_io[i][0]*linksize + middle_link_data_io[i][1]*cmsize);
147 for(
int i = 0;i < 2; i++){
148 total_io += num_calls_side_link[i]
149 *(side_link_data_io[i][0]*linksize + side_link_data_io[i][1]*cmsize);
151 for(
int i = 0;i < 2; i++){
152 total_io += num_calls_all_link[i]
153 *(all_link_data_io[i][0]*linksize + all_link_data_io[i][1]*cmsize);
158 double total_flops = 0;
159 for(
int i = 0;i < 6; i++){
160 total_flops += num_calls_middle_link[i]
161 *(middle_link_data_flops[i][0]*matrix_mul_flops + middle_link_data_flops[i][1]*matrix_add_flops);
164 for(
int i = 0;i < 2; i++){
165 total_flops += num_calls_side_link[i]
166 *(side_link_data_flops[i][0]*matrix_mul_flops + side_link_data_flops[i][1]*matrix_add_flops);
168 for(
int i = 0;i < 2; i++){
169 total_flops += num_calls_all_link[i]
170 *(all_link_data_flops[i][0]*matrix_mul_flops + all_link_data_flops[i][1]*matrix_add_flops);
175 *flops = total_flops;
177 printfQuda(
"flop/byte =%.1f\n", total_flops/total_io);
188 qudaGaugeParam.
X[0] =
xdim;
189 qudaGaugeParam.
X[1] =
ydim;
190 qudaGaugeParam.
X[2] =
zdim;
191 qudaGaugeParam.
X[3] =
tdim;
205 memcpy(&qudaGaugeParam_ex, &qudaGaugeParam,
sizeof(
QudaGaugeParam));
206 qudaGaugeParam_ex.
X[0] = qudaGaugeParam.
X[0] + 4;
207 qudaGaugeParam_ex.
X[1] = qudaGaugeParam.
X[1] + 4;
208 qudaGaugeParam_ex.
X[2] = qudaGaugeParam.
X[2] + 4;
209 qudaGaugeParam_ex.
X[3] = qudaGaugeParam.
X[3] + 4;
227 int gSize = qudaGaugeParam.
cpu_prec;
229 for(
int i=0;i < 4;i++){
232 errorQuda(
"ERROR: cudaMallocHost failed for sitelink_2d\n");
235 errorQuda(
"ERROR: cudaMallocHost failed for sitelink_ex_2d\n");
242 errorQuda(
"malloc failed for siteLink_2d/siteLink_ex_2d\n");
258 for(
int i=0; i <
V_ex; i++){
276 if( x1< 2 || x1 >= X1 +2
277 || x2< 2 || x2 >= X2 +2
278 || x3< 2 || x3 >= X3 +2
279 || x4< 2 || x4 >= X4 +2){
285 x1 = (x1 - 2 +
X1) % X1;
286 x2 = (x2 - 2 +
X2) % X2;
287 x3 = (x3 - 2 +
X3) % X3;
288 x4 = (x4 - 2 +
X4) % X4;
290 int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+
x1)>>1;
294 for(
int dir= 0; dir < 4; dir++){
297 memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
305 for(
int dir = 0; dir < 4; dir++){
306 for(
int i = 0;i <
V; i++){
309 memcpy(dst + (4*i+dir)*gaugeSiteSize*
link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize \
315 for(
int dir = 0; dir < 4; dir++){
316 for(
int i = 0;i <
V; i++){
319 memcpy(dst + (4*i+dir)*gaugeSiteSize*
link_prec, src + i*gaugeSiteSize*link_prec, gaugeSiteSize*link_prec);
323 for(
int dir=0;dir < 4; dir++){
326 memcpy(dst, src, V*gaugeSiteSize*
link_prec);
332 errorQuda(
"multi_gpu milc is not supported\n");
334 for(
int dir=0;dir < 4; dir++){
337 memcpy(dst, src, V_ex*gaugeSiteSize*
link_prec);
346 gParam_ex.precision =
prec;
401 fprintf(stderr,
"ERROR: malloc failed for hw\n");
424 for(
int i=0; i <
V_ex; i++){
433 int x1h = sid - za*
E1h;
438 int x1odd = (x2 + x3 + x4 +
oddBit) & 1;
439 int x1 = 2*x1h +
x1odd;
442 if( x1< 2 || x1 >= X1 +2
443 || x2< 2 || x2 >= X2 +2
444 || x3< 2 || x3 >= X3 +2
445 || x4< 2 || x4 >= X4 +2){
451 x1 = (x1 - 2 +
X1) % X1;
452 x2 = (x2 - 2 +
X2) % X2;
453 x3 = (x3 - 2 +
X3) % X3;
454 x4 = (x4 - 2 +
X4) % X4;
456 int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+
x1)>>1;
460 for(
int dir= 0; dir < 4; dir++){
463 memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
465 src = ((
char**)cpuLongLinkOprod->Gauge_p())[dir];
467 memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
491 for(
int i = 0;i < 4; i++){
538 hisq_force_test(
void)
547 float act_path_coeff[6];
549 act_path_coeff[0] = 0.625000;
550 act_path_coeff[1] = -0.058479;
551 act_path_coeff[2] = -0.087719;
552 act_path_coeff[3] = 0.030778;
553 act_path_coeff[4] = -0.007200;
554 act_path_coeff[5] = -0.123113;
558 double d_act_path_coeff[6];
559 for(
int i=0; i<6; ++i){
560 d_act_path_coeff[i] = act_path_coeff[i];
567 int R[4] = {2, 2, 2, 2};
593 struct timeval ht0, ht1;
594 gettimeofday(&ht0, NULL);
610 gettimeofday(&ht1, NULL);
612 struct timeval t0, t1, t2, t3;
614 gettimeofday(&t0, NULL);
618 cudaDeviceSynchronize();
619 gettimeofday(&t1, NULL);
626 cudaDeviceSynchronize();
628 gettimeofday(&t2, NULL);
632 cudaDeviceSynchronize();
633 gettimeofday(&t1, NULL);
638 cudaDeviceSynchronize();
639 gettimeofday(&t2, NULL);
660 cudaDeviceSynchronize();
662 gettimeofday(&t3, NULL);
668 int accuracy_level = 3;
674 printfQuda(
"Test %s\n",(1 == res) ?
"PASSED" :
"FAILED");
680 float perf_flops = total_flops / (
TDIFF(t0, t1)) *1e-9;
681 float perf = total_io / (
TDIFF(t0, t1)) *1e-9;
682 printfQuda(
"Staples time: %.2f ms, perf = %.2f GFLOPS, achieved bandwidth= %.2f GB/s\n",
TDIFF(t0,t1)*1000, perf_flops, perf);
683 printfQuda(
"Staples time : %g ms\t LongLink time : %g ms\t Completion time : %g ms\n",
TDIFF(t0,t1)*1000,
TDIFF(t1,t2)*1000,
TDIFF(t2,t3)*1000);
684 printfQuda(
"Host time (half-wilson fermion force) : %g ms\n",
TDIFF(ht0, ht1)*1000);
688 return accuracy_level;
695 printfQuda(
"running the following fermion force computation test:\n");
697 printfQuda(
"link_precision link_reconstruct space_dim(x/y/z) T_dimension Gauge_order\n");
711 for (i =1;i < argc; i++){
717 if( strcmp(argv[i],
"--gauge-order") == 0){
722 if(strcmp(argv[i+1],
"milc") == 0){
724 }
else if(strcmp(argv[i+1],
"qdp") == 0){
727 fprintf(stderr,
"Error: unsupported gauge-field order\n");
734 fprintf(stderr,
"ERROR: Invalid option:%s\n", argv[i]);
740 errorQuda(
"Multi-gpu for milc order is not supported\n");
750 int accuracy_level = hisq_force_test();
755 if(accuracy_level >=3 ){
int main(int argc, char **argv)
cpuGaugeField * cpuForce_ex
enum QudaPrecision_s QudaPrecision
cudaGaugeField * cudaOprod_ex
void exchangeExtendedGhost(const int *R, bool no_comms_fill=false)
cudaGaugeField * cudaForce_ex
void saveCPUField(cpuGaugeField &, const QudaFieldLocation &) const
void hisqStaplesForceCPU(const double *path_coeff, const QudaGaugeParam ¶m, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
void createHwCPU(void *hw, QudaPrecision precision)
int process_command_line_option(int argc, char **argv, int *idx)
int gridsize_from_cmdline[]
void computeLinkOrderedOuterProduct(void *src, void *dst, QudaPrecision precision, int gauge_order)
void setPrecision(QudaPrecision precision)
const char * get_gauge_order_str(QudaGaugeFieldOrder order)
QudaGaugeFieldOrder gauge_order
int compare_floats(void *a, void *b, int len, double epsilon, QudaPrecision precision)
const char * get_prec_str(QudaPrecision prec)
void createSiteLinkCPU(void **link, QudaPrecision precision, int phase)
QudaPrecision cpu_hw_prec
void hisqCompleteForceCPU(const QudaGaugeParam ¶m, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *mom)
void setTuning(QudaTune tune)
void initQuda(int device)
cudaGaugeField * cudaLongLinkOprod
QudaReconstructType link_recon
const char * get_recon_str(QudaReconstructType recon)
void hisqCompleteForceCuda(const QudaGaugeParam ¶m, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *force, long long *flops=NULL)
cpuGaugeField * cpuOprod_ex
QudaGaugeFieldOrder order
void loadCPUField(const cpuGaugeField &, const QudaFieldLocation &)
enum QudaGaugeFieldOrder_s QudaGaugeFieldOrder
void hisqLongLinkForceCPU(double coeff, const QudaGaugeParam ¶m, quda::cpuGaugeField &oprod, quda::cpuGaugeField &link, quda::cpuGaugeField *newOprod)
QudaReconstructType reconstruct
int strong_check_mom(void *momA, void *momB, int len, QudaPrecision prec)
void hisqLongLinkForceCuda(double coeff, const QudaGaugeParam ¶m, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
void hisqStaplesForceCuda(const double path_coeff[6], const QudaGaugeParam ¶m, const cudaGaugeField &oprod, const cudaGaugeField &link, cudaGaugeField *newOprod, long long *flops=NULL)
cpuGaugeField * cpuLongLinkOprod_ex
cudaGaugeField * cudaGauge_ex
void * memset(void *s, int c, size_t n)
cpuGaugeField * cpuLongLinkOprod
enum QudaReconstructType_s QudaReconstructType
Main header file for the QUDA library.
cpuGaugeField * cpuGauge_ex
cudaGaugeField * cudaGauge
QudaGaugeFieldOrder gauge_order
QudaReconstructType reconstruct
QudaGaugeFieldOrder order
cudaGaugeField * cudaForce
cudaGaugeField * cudaOprod
cudaGaugeField * cudaLongLinkOprod_ex
void initComms(int argc, char **argv, const int *commDims)
void setVerbosity(const QudaVerbosity verbosity)
void total_staple_io_flops(QudaPrecision prec, QudaReconstructType recon, double *io, double *flops)